From 85a7b33b9653ade7b26b9f29765cb1f0719c1e84 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Thu, 26 Jul 2012 23:39:10 +0200
Subject: Btrfs: add rdev to get_inode_info in send/receive

We need rdev in the next commit.

Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fb5ffe9..5721401 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -687,7 +687,8 @@ out:
  */
 static int get_inode_info(struct btrfs_root *root,
 			  u64 ino, u64 *size, u64 *gen,
-			  u64 *mode, u64 *uid, u64 *gid)
+			  u64 *mode, u64 *uid, u64 *gid,
+			  u64 *rdev)
 {
 	int ret;
 	struct btrfs_inode_item *ii;
@@ -721,6 +722,8 @@ static int get_inode_info(struct btrfs_root *root,
 		*uid = btrfs_inode_uid(path->nodes[0], ii);
 	if (gid)
 		*gid = btrfs_inode_gid(path->nodes[0], ii);
+	if (rdev)
+		*rdev = btrfs_inode_rdev(path->nodes[0], ii);
 
 out:
 	btrfs_free_path(path);
@@ -1081,7 +1084,8 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	 * There are inodes that have extents that lie behind it's i_size. Don't
 	 * accept clones from these extents.
 	 */
-	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL);
+	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
+			NULL);
 	if (ret < 0)
 		return ret;
 
@@ -1404,7 +1408,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
 	u64 right_gen;
 
 	ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0 && ret != -ENOENT)
 		goto out;
 	left_ret = ret;
@@ -1413,7 +1417,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
 		right_ret = -ENOENT;
 	} else {
 		ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
-				NULL, NULL, NULL);
+				NULL, NULL, NULL, NULL);
 		if (ret < 0 && ret != -ENOENT)
 			goto out;
 		right_ret = ret;
@@ -1557,7 +1561,7 @@ static int get_first_ref(struct send_ctx *sctx,
 	btrfs_release_path(path);
 
 	ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -1628,7 +1632,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 
 	if (other_inode > sctx->send_progress) {
 		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
-				who_gen, NULL, NULL, NULL);
+				who_gen, NULL, NULL, NULL, NULL);
 		if (ret < 0)
 			goto out;
 
@@ -1671,7 +1675,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
 	}
 
 	ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -2540,7 +2544,7 @@ int __finish_unordered_dir(int num, struct btrfs_key *di_key,
 	fs_path_reset(fctx->cur_path);
 
 	ret = get_inode_info(sctx->send_root, di_key->objectid,
-			NULL, &di_gen, &di_mode, NULL, NULL);
+			NULL, &di_gen, &di_mode, NULL, NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -2971,7 +2975,7 @@ static int __record_new_ref(int num, u64 dir, int index,
 		return -ENOMEM;
 
 	ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -3029,7 +3033,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
 		return -ENOMEM;
 
 	ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -3642,7 +3646,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 
 	if (clone_root2 == sctx->send_root) {
 		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
-				&gen, NULL, NULL, NULL);
+				&gen, NULL, NULL, NULL, NULL);
 		if (ret < 0)
 			goto out;
 		ret = get_cur_path(sctx, clone_root->ino, gen, p);
@@ -4004,7 +4008,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 		goto out;
 
 	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
-			&left_mode, &left_uid, &left_gid);
+			&left_mode, &left_uid, &left_gid, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -4015,7 +4019,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 		} else {
 			ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
 					NULL, NULL, &right_mode, &right_uid,
-					&right_gid);
+					&right_gid, NULL);
 			if (ret < 0)
 				goto out;
 
-- 
cgit v0.10.2


From 1f4692da951af4179a6522c6b48a09a43d37e614 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 10:42:24 +0200
Subject: Btrfs: fix cur_ino < parent_ino case for send/receive

When the current inodes inum is smaller then the inum of the
parent directory strange things were happending due to wrong
path resolution and other bugs. Fix this with a new approach
for the problem.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 5721401..a5fae48 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -107,7 +107,6 @@ struct send_ctx {
 	int cur_inode_new;
 	int cur_inode_new_gen;
 	int cur_inode_deleted;
-	int cur_inode_first_ref_orphan;
 	u64 cur_inode_size;
 	u64 cur_inode_mode;
 
@@ -2296,25 +2295,25 @@ out:
  * a valid path yet because we did not process the refs yet. So, the inode
  * is created as orphan.
  */
-static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
-			     struct btrfs_key *key)
+static int send_create_inode(struct send_ctx *sctx, u64 ino)
 {
 	int ret = 0;
-	struct extent_buffer *eb = path->nodes[0];
-	struct btrfs_inode_item *ii;
 	struct fs_path *p;
-	int slot = path->slots[0];
 	int cmd;
+	u64 gen;
 	u64 mode;
+	u64 rdev;
 
-verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
+verbose_printk("btrfs: send_create_inode %llu\n", ino);
 
 	p = fs_path_alloc(sctx);
 	if (!p)
 		return -ENOMEM;
 
-	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
-	mode = btrfs_inode_mode(eb, ii);
+	ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
+			NULL, &rdev);
+	if (ret < 0)
+		goto out;
 
 	if (S_ISREG(mode))
 		cmd = BTRFS_SEND_C_MKFILE;
@@ -2339,22 +2338,22 @@ verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
 	if (ret < 0)
 		goto out;
 
-	ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	ret = gen_unique_name(sctx, ino, gen, p);
 	if (ret < 0)
 		goto out;
 
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
-	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
 
 	if (S_ISLNK(mode)) {
 		fs_path_reset(p);
-		ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
+		ret = read_symlink(sctx, sctx->send_root, ino, p);
 		if (ret < 0)
 			goto out;
 		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
 	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
 		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
-		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev);
 	}
 
 	ret = send_cmd(sctx);
@@ -2368,6 +2367,92 @@ out:
 	return ret;
 }
 
+/*
+ * We need some special handling for inodes that get processed before the parent
+ * directory got created. See process_recorded_refs for details.
+ * This function does the check if we already created the dir out of order.
+ */
+static int did_create_dir(struct send_ctx *sctx, u64 dir)
+{
+	int ret = 0;
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key di_key;
+	struct extent_buffer *eb;
+	struct btrfs_dir_item *di;
+	int slot;
+
+	path = alloc_path_for_send();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = dir;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = 0;
+	while (1) {
+		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
+				1, 0);
+		if (ret < 0)
+			goto out;
+		if (!ret) {
+			eb = path->nodes[0];
+			slot = path->slots[0];
+			btrfs_item_key_to_cpu(eb, &found_key, slot);
+		}
+		if (ret || found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+		if (di_key.objectid < sctx->send_progress) {
+			ret = 1;
+			goto out;
+		}
+
+		key.offset = found_key.offset + 1;
+		btrfs_release_path(path);
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Only creates the inode if it is:
+ * 1. Not a directory
+ * 2. Or a directory which was not created already due to out of order
+ *    directories. See did_create_dir and process_recorded_refs for details.
+ */
+static int send_create_inode_if_needed(struct send_ctx *sctx)
+{
+	int ret;
+
+	if (S_ISDIR(sctx->cur_inode_mode)) {
+		ret = did_create_dir(sctx, sctx->cur_ino);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = 0;
+			goto out;
+		}
+	}
+
+	ret = send_create_inode(sctx, sctx->cur_ino);
+	if (ret < 0)
+		goto out;
+
+out:
+	return ret;
+}
+
 struct recorded_ref {
 	struct list_head list;
 	char *dir_path;
@@ -2517,160 +2602,6 @@ out:
 	return ret;
 }
 
-struct finish_unordered_dir_ctx {
-	struct send_ctx *sctx;
-	struct fs_path *cur_path;
-	struct fs_path *dir_path;
-	u64 dir_ino;
-	int need_delete;
-	int delete_pass;
-};
-
-int __finish_unordered_dir(int num, struct btrfs_key *di_key,
-			   const char *name, int name_len,
-			   const char *data, int data_len,
-			   u8 type, void *ctx)
-{
-	int ret = 0;
-	struct finish_unordered_dir_ctx *fctx = ctx;
-	struct send_ctx *sctx = fctx->sctx;
-	u64 di_gen;
-	u64 di_mode;
-	int is_orphan = 0;
-
-	if (di_key->objectid >= fctx->dir_ino)
-		goto out;
-
-	fs_path_reset(fctx->cur_path);
-
-	ret = get_inode_info(sctx->send_root, di_key->objectid,
-			NULL, &di_gen, &di_mode, NULL, NULL, NULL);
-	if (ret < 0)
-		goto out;
-
-	ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
-			fctx->dir_ino, name, name_len);
-	if (ret < 0)
-		goto out;
-	if (ret) {
-		is_orphan = 1;
-		ret = gen_unique_name(sctx, di_key->objectid, di_gen,
-				fctx->cur_path);
-	} else {
-		ret = get_cur_path(sctx, di_key->objectid, di_gen,
-				fctx->cur_path);
-	}
-	if (ret < 0)
-		goto out;
-
-	ret = fs_path_add(fctx->dir_path, name, name_len);
-	if (ret < 0)
-		goto out;
-
-	if (!fctx->delete_pass) {
-		if (S_ISDIR(di_mode)) {
-			ret = send_rename(sctx, fctx->cur_path,
-					fctx->dir_path);
-		} else {
-			ret = send_link(sctx, fctx->dir_path,
-					fctx->cur_path);
-			if (is_orphan)
-				fctx->need_delete = 1;
-		}
-	} else if (!S_ISDIR(di_mode)) {
-		ret = send_unlink(sctx, fctx->cur_path);
-	} else {
-		ret = 0;
-	}
-
-	fs_path_remove(fctx->dir_path);
-
-out:
-	return ret;
-}
-
-/*
- * Go through all dir items and see if we find refs which could not be created
- * in the past because the dir did not exist at that time.
- */
-static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
-{
-	int ret = 0;
-	struct btrfs_path *path = NULL;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct extent_buffer *eb;
-	struct finish_unordered_dir_ctx fctx;
-	int slot;
-
-	path = alloc_path_for_send();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	memset(&fctx, 0, sizeof(fctx));
-	fctx.sctx = sctx;
-	fctx.cur_path = fs_path_alloc(sctx);
-	fctx.dir_path = fs_path_alloc(sctx);
-	if (!fctx.cur_path || !fctx.dir_path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	fctx.dir_ino = dir;
-
-	ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * We do two passes. The first links in the new refs and the second
-	 * deletes orphans if required. Deletion of orphans is not required for
-	 * directory inodes, as we always have only one ref and use rename
-	 * instead of link for those.
-	 */
-
-again:
-	key.objectid = dir;
-	key.type = BTRFS_DIR_ITEM_KEY;
-	key.offset = 0;
-	while (1) {
-		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
-				1, 0);
-		if (ret < 0)
-			goto out;
-		eb = path->nodes[0];
-		slot = path->slots[0];
-		btrfs_item_key_to_cpu(eb, &found_key, slot);
-
-		if (found_key.objectid != key.objectid ||
-		    found_key.type != key.type) {
-			btrfs_release_path(path);
-			break;
-		}
-
-		ret = iterate_dir_item(sctx, sctx->send_root, path,
-				&found_key, __finish_unordered_dir,
-				&fctx);
-		if (ret < 0)
-			goto out;
-
-		key.offset = found_key.offset + 1;
-		btrfs_release_path(path);
-	}
-
-	if (!fctx.delete_pass && fctx.need_delete) {
-		fctx.delete_pass = 1;
-		goto again;
-	}
-
-out:
-	btrfs_free_path(path);
-	fs_path_free(sctx, fctx.cur_path);
-	fs_path_free(sctx, fctx.dir_path);
-	return ret;
-}
-
 /*
  * This does all the move/link/unlink/rmdir magic.
  */
@@ -2678,6 +2609,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
 {
 	int ret = 0;
 	struct recorded_ref *cur;
+	struct recorded_ref *cur2;
 	struct ulist *check_dirs = NULL;
 	struct ulist_iterator uit;
 	struct ulist_node *un;
@@ -2735,6 +2667,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 
 	list_for_each_entry(cur, &sctx->new_refs, list) {
 		/*
+		 * We may have refs where the parent directory does not exist
+		 * yet. This happens if the parent directories inum is higher
+		 * the the current inum. To handle this case, we create the
+		 * parent directory out of order. But we need to check if this
+		 * did already happen before due to other refs in the same dir.
+		 */
+		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+		if (ret < 0)
+			goto out;
+		if (ret == inode_state_will_create) {
+			ret = 0;
+			/*
+			 * First check if any of the current inodes refs did
+			 * already create the dir.
+			 */
+			list_for_each_entry(cur2, &sctx->new_refs, list) {
+				if (cur == cur2)
+					break;
+				if (cur2->dir == cur->dir) {
+					ret = 1;
+					break;
+				}
+			}
+
+			/*
+			 * If that did not happen, check if a previous inode
+			 * did already create the dir.
+			 */
+			if (!ret)
+				ret = did_create_dir(sctx, cur->dir);
+			if (ret < 0)
+				goto out;
+			if (!ret) {
+				ret = send_create_inode(sctx, cur->dir);
+				if (ret < 0)
+					goto out;
+			}
+		}
+
+		/*
 		 * Check if this new ref would overwrite the first ref of
 		 * another unprocessed inode. If yes, orphanize the
 		 * overwritten inode. If we find an overwritten ref that is
@@ -2768,7 +2740,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		 * inode, move it and update valid_path. If not, link or move
 		 * it depending on the inode mode.
 		 */
-		if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+		if (is_orphan) {
 			ret = send_rename(sctx, valid_path, cur->full_path);
 			if (ret < 0)
 				goto out;
@@ -2844,35 +2816,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			if (ret < 0)
 				goto out;
 			if (!ret) {
-				/*
-				 * In case the inode was moved to a directory
-				 * that was not created yet (see
-				 * __record_new_ref), we can not unlink the ref
-				 * as it will be needed later when the parent
-				 * directory is created, so that we can move in
-				 * the inode to the new dir.
-				 */
-				if (!is_orphan &&
-				    sctx->cur_inode_first_ref_orphan) {
-					ret = orphanize_inode(sctx,
-							sctx->cur_ino,
-							sctx->cur_inode_gen,
-							cur->full_path);
-					if (ret < 0)
-						goto out;
-					ret = gen_unique_name(sctx,
-							sctx->cur_ino,
-							sctx->cur_inode_gen,
-							valid_path);
-					if (ret < 0)
-						goto out;
-					is_orphan = 1;
-
-				} else {
-					ret = send_unlink(sctx, cur->full_path);
-					if (ret < 0)
-						goto out;
-				}
+				ret = send_unlink(sctx, cur->full_path);
+				if (ret < 0)
+					goto out;
 			}
 			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
 					GFP_NOFS);
@@ -2885,11 +2831,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		 * happen when a previous inode did overwrite the first ref
 		 * of this inode and no new refs were added for the current
 		 * inode.
-		 * We can however not delete the orphan in case the inode relies
-		 * in a directory that was not created yet (see
-		 * __record_new_ref)
 		 */
-		if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+		if (is_orphan) {
 			ret = send_unlink(sctx, valid_path);
 			if (ret < 0)
 				goto out;
@@ -2939,19 +2882,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	 */
 	sctx->send_progress = sctx->cur_ino + 1;
 
-	/*
-	 * We may have a directory here that has pending refs which could not
-	 * be created before (because the dir did not exist before, see
-	 * __record_new_ref). finish_outoforder_dir will link/move the pending
-	 * refs.
-	 */
-	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
-		ret = finish_outoforder_dir(sctx, sctx->cur_ino,
-				sctx->cur_inode_gen);
-		if (ret < 0)
-			goto out;
-	}
-
 	ret = 0;
 
 out:
@@ -2979,31 +2909,6 @@ static int __record_new_ref(int num, u64 dir, int index,
 	if (ret < 0)
 		goto out;
 
-	/*
-	 * The parent may be non-existent at this point in time. This happens
-	 * if the ino of the parent dir is higher then the current ino. In this
-	 * case, we can not process this ref until the parent dir is finally
-	 * created. If we reach the parent dir later, process_recorded_refs
-	 * will go through all dir items and process the refs that could not be
-	 * processed before. In case this is the first ref, we set
-	 * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
-	 * keep an orphan of the inode so that it later can be used for
-	 * link/move
-	 */
-	ret = is_inode_existent(sctx, dir, gen);
-	if (ret < 0)
-		goto out;
-	if (!ret) {
-		ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
-				name->start, fs_path_len(name));
-		if (ret < 0)
-			goto out;
-		if (ret)
-			sctx->cur_inode_first_ref_orphan = 1;
-		ret = 0;
-		goto out;
-	}
-
 	ret = get_cur_path(sctx, dir, gen, p);
 	if (ret < 0)
 		goto out;
@@ -4078,7 +3983,6 @@ static int changed_inode(struct send_ctx *sctx,
 
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
-	sctx->cur_inode_first_ref_orphan = 0;
 	sctx->send_progress = sctx->cur_ino;
 
 	if (result == BTRFS_COMPARE_TREE_NEW ||
@@ -4115,8 +4019,7 @@ static int changed_inode(struct send_ctx *sctx,
 		sctx->cur_inode_mode = btrfs_inode_mode(
 				sctx->left_path->nodes[0], left_ii);
 		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
-			ret = send_create_inode(sctx, sctx->left_path,
-					sctx->cmp_key);
+			ret = send_create_inode_if_needed(sctx);
 	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
 		sctx->cur_inode_gen = right_gen;
 		sctx->cur_inode_new = 0;
@@ -4146,8 +4049,7 @@ static int changed_inode(struct send_ctx *sctx,
 					sctx->left_path->nodes[0], left_ii);
 			sctx->cur_inode_mode = btrfs_inode_mode(
 					sctx->left_path->nodes[0], left_ii);
-			ret = send_create_inode(sctx, sctx->left_path,
-					sctx->cmp_key);
+			ret = send_create_inode_if_needed(sctx);
 			if (ret < 0)
 				goto out;
 
-- 
cgit v0.10.2


From b9291affaa4576762c30fb31083f33f5d745dea1 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 11:07:18 +0200
Subject: Btrfs: add missing check for dir != tmp_dir to is_first_ref

We missed that check which resultet in all refs with the same name
being reported as first_ref.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a5fae48..bea5b43 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1589,7 +1589,7 @@ static int is_first_ref(struct send_ctx *sctx,
 	if (ret < 0)
 		goto out;
 
-	if (name_len != fs_path_len(tmp_name)) {
+	if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
 		ret = 0;
 		goto out;
 	}
-- 
cgit v0.10.2


From 9ea3ef516d828e435d283b7d5f0de05fd72a23ac Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 11:08:09 +0200
Subject: Btrfs: remove unused code with #if 0

fs_path_remove is not used at the moment due to a previous patch.
Remove it for now (with #if 0) to avoid compile warnings.

Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index bea5b43..9e3f6d1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -327,6 +327,7 @@ out:
 	return ret;
 }
 
+#if 0
 static void fs_path_remove(struct fs_path *p)
 {
 	BUG_ON(p->reversed);
@@ -334,6 +335,7 @@ static void fs_path_remove(struct fs_path *p)
 		p->end--;
 	*p->end = 0;
 }
+#endif
 
 static int fs_path_copy(struct fs_path *p, struct fs_path *from)
 {
-- 
cgit v0.10.2


From ccf1626b49a94d66f3bb58d634888049ac695b46 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 11:46:29 +0200
Subject: Btrfs: add correct parent to check_dirs when dir got moved

We only added the parent for the new position of a moved dir.
We also need to add the old parent of the moved dir.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9e3f6d1..328654e 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2805,6 +2805,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			if (ret < 0)
 				goto out;
 		}
+	} else if (S_ISDIR(sctx->cur_inode_mode) &&
+		   !list_empty(&sctx->deleted_refs)) {
+		/*
+		 * We have a moved dir. Add the old parent to check_dirs
+		 */
+		cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
+				list);
+		ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
+				GFP_NOFS);
+		if (ret < 0)
+			goto out;
 	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
 		/*
 		 * We have a non dir inode. Go through all deleted refs and
-- 
cgit v0.10.2


From d27aed5e24f8e7bb2b0d11e7a579f2bbdebafc2f Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 12:04:16 +0200
Subject: Btrfs: remove unused use_list from send/receive code

use_list is a leftover and unused.

Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 328654e..f1d44b1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -125,7 +125,6 @@ struct send_ctx {
 
 struct name_cache_entry {
 	struct list_head list;
-	struct list_head use_list;
 	u64 ino;
 	u64 gen;
 	u64 parent_ino;
@@ -1906,7 +1905,6 @@ out_cache:
 	nce->name_len = fs_path_len(dest);
 	nce->ret = ret;
 	strcpy(nce->name, dest->start);
-	memset(&nce->use_list, 0, sizeof(nce->use_list));
 
 	if (ino < sctx->send_progress)
 		nce->need_later_update = 0;
-- 
cgit v0.10.2


From ee849c0472a9fe1dc09fe8390965d993b9c4e979 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 12:42:05 +0200
Subject: Btrfs: rename backref_ctx::found_in_send_root to found_itself

The new name should be easier to understand/read.

Commit is a result of Arne's review.

Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f1d44b1..a25be0c 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1029,7 +1029,7 @@ struct backref_ctx {
 	u64 extent_len;
 
 	/* Just to check for bugs in backref resolving */
-	int found_in_send_root;
+	int found_itself;
 };
 
 static int __clone_root_cmp_bsearch(const void *key, const void *elt)
@@ -1077,7 +1077,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	if (found->root == bctx->sctx->send_root &&
 	    ino == bctx->cur_objectid &&
 	    offset == bctx->cur_offset) {
-		bctx->found_in_send_root = 1;
+		bctx->found_itself = 1;
 	}
 
 	/*
@@ -1210,7 +1210,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	backref_ctx.found = 0;
 	backref_ctx.cur_objectid = ino;
 	backref_ctx.cur_offset = data_offset;
-	backref_ctx.found_in_send_root = 0;
+	backref_ctx.found_itself = 0;
 	backref_ctx.extent_len = num_bytes;
 
 	/*
@@ -1231,7 +1231,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	if (ret < 0)
 		goto out;
 
-	if (!backref_ctx.found_in_send_root) {
+	if (!backref_ctx.found_itself) {
 		/* found a bug in backref code? */
 		ret = -EIO;
 		printk(KERN_ERR "btrfs: ERROR did not find backref in "
-- 
cgit v0.10.2


From 35075bb046cc91f42a0e5336bdc07f3279061add Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 12:44:34 +0200
Subject: Btrfs: use kmalloc instead of stack for backref_ctx

Make sure to never get in trouble due to the backref_ctx
which was on the stack before.

Commit is a result of Arne's review.

Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a25be0c..3671112 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1149,7 +1149,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	u64 extent_item_pos;
 	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *eb = path->nodes[0];
-	struct backref_ctx backref_ctx;
+	struct backref_ctx *backref_ctx = NULL;
 	struct clone_root *cur_clone_root;
 	struct btrfs_key found_key;
 	struct btrfs_path *tmp_path;
@@ -1159,6 +1159,12 @@ static int find_extent_clone(struct send_ctx *sctx,
 	if (!tmp_path)
 		return -ENOMEM;
 
+	backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
+	if (!backref_ctx) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	if (data_offset >= ino_size) {
 		/*
 		 * There may be extents that lie behind the file's size.
@@ -1206,12 +1212,12 @@ static int find_extent_clone(struct send_ctx *sctx,
 		cur_clone_root->found_refs = 0;
 	}
 
-	backref_ctx.sctx = sctx;
-	backref_ctx.found = 0;
-	backref_ctx.cur_objectid = ino;
-	backref_ctx.cur_offset = data_offset;
-	backref_ctx.found_itself = 0;
-	backref_ctx.extent_len = num_bytes;
+	backref_ctx->sctx = sctx;
+	backref_ctx->found = 0;
+	backref_ctx->cur_objectid = ino;
+	backref_ctx->cur_offset = data_offset;
+	backref_ctx->found_itself = 0;
+	backref_ctx->extent_len = num_bytes;
 
 	/*
 	 * The last extent of a file may be too large due to page alignment.
@@ -1219,7 +1225,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	 * __iterate_backrefs work.
 	 */
 	if (data_offset + num_bytes >= ino_size)
-		backref_ctx.extent_len = ino_size - data_offset;
+		backref_ctx->extent_len = ino_size - data_offset;
 
 	/*
 	 * Now collect all backrefs.
@@ -1227,11 +1233,11 @@ static int find_extent_clone(struct send_ctx *sctx,
 	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(sctx->send_root->fs_info,
 					found_key.objectid, extent_item_pos, 1,
-					__iterate_backrefs, &backref_ctx);
+					__iterate_backrefs, backref_ctx);
 	if (ret < 0)
 		goto out;
 
-	if (!backref_ctx.found_itself) {
+	if (!backref_ctx->found_itself) {
 		/* found a bug in backref code? */
 		ret = -EIO;
 		printk(KERN_ERR "btrfs: ERROR did not find backref in "
@@ -1246,7 +1252,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 		"num_bytes=%llu, logical=%llu\n",
 		data_offset, ino, num_bytes, logical);
 
-	if (!backref_ctx.found)
+	if (!backref_ctx->found)
 		verbose_printk("btrfs:    no clones found\n");
 
 	cur_clone_root = NULL;
@@ -1271,6 +1277,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 
 out:
 	btrfs_free_path(tmp_path);
+	kfree(backref_ctx);
 	return ret;
 }
 
-- 
cgit v0.10.2


From 52f9e53ede8e1b261e68216c6c2f32bb3f26c795 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 16:32:45 +0200
Subject: Btrfs: use normal return path for root == send_root case

Don't have a seperate return path for the mentioned case. Now
we do the same "take lowest inode/offset" logic for all found clones.

Commit is a result of Arne's review.

Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3671112..d2a4ee9 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1109,12 +1109,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 			return 0;
 		if (offset + ctx->extent_len > ctx->cur_offset)
 			return 0;*/
-
-		bctx->found++;
-		found->found_refs++;
-		found->ino = ino;
-		found->offset = offset;
-		return 0;
 	}
 
 	bctx->found++;
-- 
cgit v0.10.2


From adbe7fb6c4750621a56867d9bb1980da3a4b8f33 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 12:51:32 +0200
Subject: Btrfs: don't break in the final loop of find_extent_clone

If we break, we may miss the clone from send_root which we prefer
over all other clones.

Commit is a result of Arne's review.

Reported-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d2a4ee9..68b2543 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1257,7 +1257,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 			else if (sctx->clone_roots[i].root == sctx->send_root)
 				/* prefer clones from send_root over others */
 				cur_clone_root = sctx->clone_roots + i;
-			break;
 		}
 
 	}
-- 
cgit v0.10.2


From 17589bd96eeb7c2ef2d3baeb05005a24932cd1e9 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 14:13:35 +0200
Subject: Btrfs: fix memory leak for name_cache in send/receive

When everything is done, name_cache_free is called which however
forgot to call kfree on the cache entries.

Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 68b2543..9cee678 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1822,6 +1822,7 @@ static void name_cache_free(struct send_ctx *sctx)
 
 	list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
 		name_cache_delete(sctx, nce);
+		kfree(nce);
 	}
 }
 
-- 
cgit v0.10.2


From 7e0926fe5f20b5c88e51cba68512302b10f73d2a Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 14:20:58 +0200
Subject: Btrfs: fix use of radix_tree for name_cache in send/receive

We can't easily use the index of the radix tree for inums as the
radix tree uses 32bit indexes on 32bit kernels. For 32bit kernels,
we now use the lower 32bit of the inum as index and an additional
list to store multiple entries per radix tree entry.

Reported-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9cee678..02e901a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -125,6 +125,15 @@ struct send_ctx {
 
 struct name_cache_entry {
 	struct list_head list;
+	/*
+	 * radix_tree has only 32bit entries but we need to handle 64bit inums.
+	 * We use the lower 32bit of the 64bit inum to store it in the tree. If
+	 * more then one inum would fall into the same entry, we use radix_list
+	 * to store the additional entries. radix_list is also used to store
+	 * entries where two entries have the same inum but different
+	 * generations.
+	 */
+	struct list_head radix_list;
 	u64 ino;
 	u64 gen;
 	u64 parent_ino;
@@ -1726,27 +1735,21 @@ static int name_cache_insert(struct send_ctx *sctx,
 			     struct name_cache_entry *nce)
 {
 	int ret = 0;
-	struct name_cache_entry **ncea;
-
-	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
-	if (ncea) {
-		if (!ncea[0])
-			ncea[0] = nce;
-		else if (!ncea[1])
-			ncea[1] = nce;
-		else
-			BUG();
-	} else {
-		ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
-		if (!ncea)
+	struct list_head *nce_head;
+
+	nce_head = radix_tree_lookup(&sctx->name_cache,
+			(unsigned long)nce->ino);
+	if (!nce_head) {
+		nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
+		if (!nce_head)
 			return -ENOMEM;
+		INIT_LIST_HEAD(nce_head);
 
-		ncea[0] = nce;
-		ncea[1] = NULL;
-		ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
+		ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
 		if (ret < 0)
 			return ret;
 	}
+	list_add_tail(&nce->radix_list, nce_head);
 	list_add_tail(&nce->list, &sctx->name_cache_list);
 	sctx->name_cache_size++;
 
@@ -1756,41 +1759,36 @@ static int name_cache_insert(struct send_ctx *sctx,
 static void name_cache_delete(struct send_ctx *sctx,
 			      struct name_cache_entry *nce)
 {
-	struct name_cache_entry **ncea;
-
-	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
-	BUG_ON(!ncea);
-
-	if (ncea[0] == nce)
-		ncea[0] = NULL;
-	else if (ncea[1] == nce)
-		ncea[1] = NULL;
-	else
-		BUG();
+	struct list_head *nce_head;
 
-	if (!ncea[0] && !ncea[1]) {
-		radix_tree_delete(&sctx->name_cache, nce->ino);
-		kfree(ncea);
-	}
+	nce_head = radix_tree_lookup(&sctx->name_cache,
+			(unsigned long)nce->ino);
+	BUG_ON(!nce_head);
 
+	list_del(&nce->radix_list);
 	list_del(&nce->list);
-
 	sctx->name_cache_size--;
+
+	if (list_empty(nce_head)) {
+		radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+		kfree(nce_head);
+	}
 }
 
 static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
 						    u64 ino, u64 gen)
 {
-	struct name_cache_entry **ncea;
+	struct list_head *nce_head;
+	struct name_cache_entry *cur;
 
-	ncea = radix_tree_lookup(&sctx->name_cache, ino);
-	if (!ncea)
+	nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+	if (!nce_head)
 		return NULL;
 
-	if (ncea[0] && ncea[0]->gen == gen)
-		return ncea[0];
-	else if (ncea[1] && ncea[1]->gen == gen)
-		return ncea[1];
+	list_for_each_entry(cur, nce_head, radix_list) {
+		if (cur->ino == ino && cur->gen == gen)
+			return cur;
+	}
 	return NULL;
 }
 
-- 
cgit v0.10.2


From 34d73f54e2e2227cece751f168d08d3103092992 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 16:18:58 +0200
Subject: Btrfs: make aux field of ulist 64 bit

Btrfs send/receive uses the aux field to store inode numbers. On
32 bit machines this may become a problem.

Also fix all users of ulist_add and ulist_add_merged.

Reported-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ff6475f..6655ca6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -231,7 +231,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			}
 			if (!ret) {
 				ret = ulist_add(parents, eb->start,
-						(unsigned long)eie, GFP_NOFS);
+						(u64)eie, GFP_NOFS);
 				if (ret < 0)
 					break;
 				if (!extent_item_pos) {
@@ -914,8 +914,8 @@ again:
 				free_extent_buffer(eb);
 			}
 			ret = ulist_add_merge(refs, ref->parent,
-					      (unsigned long)ref->inode_list,
-					      (unsigned long *)&eie, GFP_NOFS);
+					      (u64)ref->inode_list,
+					      (u64 *)&eie, GFP_NOFS);
 			if (!ret && extent_item_pos) {
 				/*
 				 * we've recorded that parent, so we must extend
@@ -1404,7 +1404,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&root_uiter);
 		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
 			pr_debug("root %llu references leaf %llu, data list "
-				 "%#lx\n", root_node->val, ref_node->val,
+				 "%#llx\n", root_node->val, ref_node->val,
 				 ref_node->aux);
 			ret = iterate_leaf_refs(
 				(struct extent_inode_elem *)ref_node->aux,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b650155..24f6adc 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1145,7 +1145,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 		ulist_reinit(tmp);
 						/* XXX id not needed */
-		ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+		ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC);
 		ULIST_ITER_INIT(&tmp_uiter);
 		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
 			struct btrfs_qgroup_list *glist;
@@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 			list_for_each_entry(glist, &qg->groups, next_group) {
 				ulist_add(tmp, glist->group->qgroupid,
-					  (unsigned long)glist->group,
+					  (u64)glist->group,
 					  GFP_ATOMIC);
 			}
 		}
@@ -1168,7 +1168,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 	 * step 2: walk from the new root
 	 */
 	ulist_reinit(tmp);
-	ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+	ulist_add(tmp, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(tmp, &uiter))) {
 		struct btrfs_qgroup *qg;
@@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(tmp, glist->group->qgroupid,
-				  (unsigned long)glist->group, GFP_ATOMIC);
+				  (u64)glist->group, GFP_ATOMIC);
 		}
 	}
 
@@ -1208,7 +1208,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 			continue;
 
 		ulist_reinit(tmp);
-		ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+		ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC);
 		ULIST_ITER_INIT(&tmp_uiter);
 		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
 			struct btrfs_qgroup_list *glist;
@@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 			list_for_each_entry(glist, &qg->groups, next_group) {
 				ulist_add(tmp, glist->group->qgroupid,
-					  (unsigned long)glist->group,
+					  (u64)glist->group,
 					  GFP_ATOMIC);
 			}
 		}
@@ -1469,7 +1469,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 	 * be exceeded
 	 */
 	ulist = ulist_alloc(GFP_ATOMIC);
-	ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+	ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
@@ -1489,7 +1489,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(ulist, glist->group->qgroupid,
-				  (unsigned long)glist->group, GFP_ATOMIC);
+				  (u64)glist->group, GFP_ATOMIC);
 		}
 	}
 	if (ret)
@@ -1541,7 +1541,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 		goto out;
 
 	ulist = ulist_alloc(GFP_ATOMIC);
-	ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+	ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
@@ -1553,7 +1553,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(ulist, glist->group->qgroupid,
-				  (unsigned long)glist->group, GFP_ATOMIC);
+				  (u64)glist->group, GFP_ATOMIC);
 		}
 	}
 
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index ab942f4..99be4c1 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free);
  * In case of allocation failure -ENOMEM is returned and the ulist stays
  * unaltered.
  */
-int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-	      gfp_t gfp_mask)
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
 {
 	return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
 }
 
-int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
-		    unsigned long *old_aux, gfp_t gfp_mask)
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+		    u64 *old_aux, gfp_t gfp_mask)
 {
 	int i;
 
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 21bdc8e..21a1963 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -33,7 +33,7 @@ struct ulist_iterator {
  */
 struct ulist_node {
 	u64 val;		/* value to store */
-	unsigned long aux;	/* auxiliary value saved along with the val */
+	u64 aux;		/* auxiliary value saved along with the val */
 };
 
 struct ulist {
@@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist);
 void ulist_reinit(struct ulist *ulist);
 struct ulist *ulist_alloc(gfp_t gfp_mask);
 void ulist_free(struct ulist *ulist);
-int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-	      gfp_t gfp_mask);
-int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
-		    unsigned long *old_aux, gfp_t gfp_mask);
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+		    u64 *old_aux, gfp_t gfp_mask);
 struct ulist_node *ulist_next(struct ulist *ulist,
 			      struct ulist_iterator *uiter);
 
-- 
cgit v0.10.2


From e479d9bb5f8366a064915b3c9ac47ed6e9fcc7d3 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 16:09:35 +0200
Subject: Btrfs: update send_progress at correct places

Updating send_progress in process_recorded_refs was not correct.
It got updated too early in the cur_inode_new_gen case.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Reported-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 02e901a..f618224 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2886,12 +2886,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		}
 	}
 
-	/*
-	 * Current inode is now at it's new position, so we must increase
-	 * send_progress
-	 */
-	sctx->send_progress = sctx->cur_ino + 1;
-
 	ret = 0;
 
 out:
@@ -3896,6 +3890,15 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
 		goto out;
 
 	ret = process_recorded_refs(sctx);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * We have processed the refs and thus need to advance send_progress.
+	 * Now, calls to get_cur_xxx will take the updated refs of the current
+	 * inode into account.
+	 */
+	sctx->send_progress = sctx->cur_ino + 1;
 
 out:
 	return ret;
@@ -3993,6 +3996,12 @@ static int changed_inode(struct send_ctx *sctx,
 
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
+
+	/*
+	 * Set send_progress to current inode. This will tell all get_cur_xxx
+	 * functions that the current inode's refs are not updated yet. Later,
+	 * when process_recorded_refs is finished, it is set to cur_ino + 1.
+	 */
 	sctx->send_progress = sctx->cur_ino;
 
 	if (result == BTRFS_COMPARE_TREE_NEW ||
@@ -4066,6 +4075,11 @@ static int changed_inode(struct send_ctx *sctx,
 			ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
 			if (ret < 0)
 				goto out;
+			/*
+			 * Advance send_progress now as we did not get into
+			 * process_recorded_refs_if_needed in the new_gen case.
+			 */
+			sctx->send_progress = sctx->cur_ino + 1;
 			ret = process_all_extents(sctx);
 			if (ret < 0)
 				goto out;
-- 
cgit v0.10.2


From 766702ef49b8b5299d819f3a0ac42613c23424d1 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 14:11:31 +0200
Subject: Btrfs: add/fix comments/documentation for send/receive

As the subject already said, add/fix comments.

Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f618224..6926546 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1067,6 +1067,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
 
 /*
  * Called for every backref that is found for the current extent.
+ * Results are collected in sctx->clone_roots->ino/offset/found_refs
  */
 static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 {
@@ -1090,7 +1091,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	}
 
 	/*
-	 * There are inodes that have extents that lie behind it's i_size. Don't
+	 * There are inodes that have extents that lie behind its i_size. Don't
 	 * accept clones from these extents.
 	 */
 	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
@@ -1137,6 +1138,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 }
 
 /*
+ * Given an inode, offset and extent item, it finds a good clone for a clone
+ * instruction. Returns -ENOENT when none could be found. The function makes
+ * sure that the returned clone is usable at the point where sending is at the
+ * moment. This means, that no clones are accepted which lie behind the current
+ * inode+offset.
+ *
  * path must point to the extent item when called.
  */
 static int find_extent_clone(struct send_ctx *sctx,
@@ -1529,6 +1536,10 @@ out:
 	return ret;
 }
 
+/*
+ * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
+ * generation of the parent dir and the name of the dir entry.
+ */
 static int get_first_ref(struct send_ctx *sctx,
 			 struct btrfs_root *root, u64 ino,
 			 u64 *dir, u64 *dir_gen, struct fs_path *name)
@@ -1615,6 +1626,16 @@ out:
 	return ret;
 }
 
+/*
+ * Used by process_recorded_refs to determine if a new ref would overwrite an
+ * already existing ref. In case it detects an overwrite, it returns the
+ * inode/gen in who_ino/who_gen.
+ * When an overwrite is detected, process_recorded_refs does proper orphanizing
+ * to make sure later references to the overwritten inode are possible.
+ * Orphanizing is however only required for the first ref of an inode.
+ * process_recorded_refs does an additional is_first_ref check to see if
+ * orphanizing is really required.
+ */
 static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 			      const char *name, int name_len,
 			      u64 *who_ino, u64 *who_gen)
@@ -1639,6 +1660,11 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 		goto out;
 	}
 
+	/*
+	 * Check if the overwritten ref was already processed. If yes, the ref
+	 * was already unlinked/moved, so we can safely assume that we will not
+	 * overwrite anything at this point in time.
+	 */
 	if (other_inode > sctx->send_progress) {
 		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
 				who_gen, NULL, NULL, NULL, NULL);
@@ -1655,6 +1681,13 @@ out:
 	return ret;
 }
 
+/*
+ * Checks if the ref was overwritten by an already processed inode. This is
+ * used by __get_cur_name_and_parent to find out if the ref was orphanized and
+ * thus the orphan name needs be used.
+ * process_recorded_refs also uses it to avoid unlinking of refs that were
+ * overwritten.
+ */
 static int did_overwrite_ref(struct send_ctx *sctx,
 			    u64 dir, u64 dir_gen,
 			    u64 ino, u64 ino_gen,
@@ -1703,6 +1736,11 @@ out:
 	return ret;
 }
 
+/*
+ * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
+ * that got overwritten. This is used by process_recorded_refs to determine
+ * if it has to use the path as returned by get_cur_path or the orphan name.
+ */
 static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 {
 	int ret = 0;
@@ -1731,6 +1769,11 @@ out:
 	return ret;
 }
 
+/*
+ * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
+ * so we need to do some special handling in case we have clashes. This function
+ * takes care of this with the help of name_cache_entry::radix_list.
+ */
 static int name_cache_insert(struct send_ctx *sctx,
 			     struct name_cache_entry *nce)
 {
@@ -1792,12 +1835,19 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
 	return NULL;
 }
 
+/*
+ * Removes the entry from the list and adds it back to the end. This marks the
+ * entry as recently used so that name_cache_clean_unused does not remove it.
+ */
 static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
 {
 	list_del(&nce->list);
 	list_add_tail(&nce->list, &sctx->name_cache_list);
 }
 
+/*
+ * Remove some entries from the beginning of name_cache_list.
+ */
 static void name_cache_clean_unused(struct send_ctx *sctx)
 {
 	struct name_cache_entry *nce;
@@ -1824,6 +1874,14 @@ static void name_cache_free(struct send_ctx *sctx)
 	}
 }
 
+/*
+ * Used by get_cur_path for each ref up to the root.
+ * Returns 0 if it succeeded.
+ * Returns 1 if the inode is not existent or got overwritten. In that case, the
+ * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
+ * is returned, parent_ino/parent_gen are not guaranteed to be valid.
+ * Returns <0 in case of error.
+ */
 static int __get_cur_name_and_parent(struct send_ctx *sctx,
 				     u64 ino, u64 gen,
 				     u64 *parent_ino,
@@ -1835,6 +1893,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	struct btrfs_path *path = NULL;
 	struct name_cache_entry *nce = NULL;
 
+	/*
+	 * First check if we already did a call to this function with the same
+	 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
+	 * return the cached result.
+	 */
 	nce = name_cache_search(sctx, ino, gen);
 	if (nce) {
 		if (ino < sctx->send_progress && nce->need_later_update) {
@@ -1857,6 +1920,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	if (!path)
 		return -ENOMEM;
 
+	/*
+	 * If the inode is not existent yet, add the orphan name and return 1.
+	 * This should only happen for the parent dir that we determine in
+	 * __record_new_ref
+	 */
 	ret = is_inode_existent(sctx, ino, gen);
 	if (ret < 0)
 		goto out;
@@ -1869,6 +1937,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 		goto out_cache;
 	}
 
+	/*
+	 * Depending on whether the inode was already processed or not, use
+	 * send_root or parent_root for ref lookup.
+	 */
 	if (ino < sctx->send_progress)
 		ret = get_first_ref(sctx, sctx->send_root, ino,
 				parent_ino, parent_gen, dest);
@@ -1878,6 +1950,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	if (ret < 0)
 		goto out;
 
+	/*
+	 * Check if the ref was overwritten by an inode's ref that was processed
+	 * earlier. If yes, treat as orphan and return 1.
+	 */
 	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
 			dest->start, dest->end - dest->start);
 	if (ret < 0)
@@ -1891,6 +1967,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	}
 
 out_cache:
+	/*
+	 * Store the result of the lookup in the name cache.
+	 */
 	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
 	if (!nce) {
 		ret = -ENOMEM;
@@ -2278,7 +2357,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
 			btrfs_inode_mtime(ii));
 	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
 			btrfs_inode_ctime(ii));
-	/* TODO otime? */
+	/* TODO Add otime support when the otime patches get into upstream */
 
 	ret = send_cmd(sctx);
 
@@ -2520,7 +2599,7 @@ static void free_recorded_refs(struct send_ctx *sctx)
 }
 
 /*
- * Renames/moves a file/dir to it's orphan name. Used when the first
+ * Renames/moves a file/dir to its orphan name. Used when the first
  * ref of an unprocessed inode gets overwritten and for all non empty
  * directories.
  */
@@ -2840,7 +2919,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		 * If the inode is still orphan, unlink the orphan. This may
 		 * happen when a previous inode did overwrite the first ref
 		 * of this inode and no new refs were added for the current
-		 * inode.
+		 * inode. Unlinking does not mean that the inode is deleted in
+		 * all cases. There may still be links to this inode in other
+		 * places.
 		 */
 		if (is_orphan) {
 			ret = send_unlink(sctx, valid_path);
@@ -2857,6 +2938,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	 */
 	ULIST_ITER_INIT(&uit);
 	while ((un = ulist_next(check_dirs, &uit))) {
+		/*
+		 * In case we had refs into dirs that were not processed yet,
+		 * we don't need to do the utime and rmdir logic for these dirs.
+		 * The dir will be processed later.
+		 */
 		if (un->val > sctx->cur_ino)
 			continue;
 
@@ -4048,7 +4134,17 @@ static int changed_inode(struct send_ctx *sctx,
 		sctx->cur_inode_mode = btrfs_inode_mode(
 				sctx->right_path->nodes[0], right_ii);
 	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+		/*
+		 * We need to do some special handling in case the inode was
+		 * reported as changed with a changed generation number. This
+		 * means that the original inode was deleted and new inode
+		 * reused the same inum. So we have to treat the old inode as
+		 * deleted and the new one as new.
+		 */
 		if (sctx->cur_inode_new_gen) {
+			/*
+			 * First, process the inode as if it was deleted.
+			 */
 			sctx->cur_inode_gen = right_gen;
 			sctx->cur_inode_new = 0;
 			sctx->cur_inode_deleted = 1;
@@ -4061,6 +4157,9 @@ static int changed_inode(struct send_ctx *sctx,
 			if (ret < 0)
 				goto out;
 
+			/*
+			 * Now process the inode as if it was new.
+			 */
 			sctx->cur_inode_gen = left_gen;
 			sctx->cur_inode_new = 1;
 			sctx->cur_inode_deleted = 0;
@@ -4080,6 +4179,11 @@ static int changed_inode(struct send_ctx *sctx,
 			 * process_recorded_refs_if_needed in the new_gen case.
 			 */
 			sctx->send_progress = sctx->cur_ino + 1;
+
+			/*
+			 * Now process all extents and xattrs of the inode as if
+			 * they were all new.
+			 */
 			ret = process_all_extents(sctx);
 			if (ret < 0)
 				goto out;
@@ -4102,6 +4206,16 @@ out:
 	return ret;
 }
 
+/*
+ * We have to process new refs before deleted refs, but compare_trees gives us
+ * the new and deleted refs mixed. To fix this, we record the new/deleted refs
+ * first and later process them in process_recorded_refs.
+ * For the cur_inode_new_gen case, we skip recording completely because
+ * changed_inode did already initiate processing of refs. The reason for this is
+ * that in this case, compare_tree actually compares the refs of 2 different
+ * inodes. To fix this, process_all_refs is used in changed_inode to handle all
+ * refs of the right tree as deleted and all refs of the left tree as new.
+ */
 static int changed_ref(struct send_ctx *sctx,
 		       enum btrfs_compare_tree_result result)
 {
@@ -4122,6 +4236,11 @@ static int changed_ref(struct send_ctx *sctx,
 	return ret;
 }
 
+/*
+ * Process new/deleted/changed xattrs. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of xattrs. The reason is the same as in changed_ref
+ */
 static int changed_xattr(struct send_ctx *sctx,
 			 enum btrfs_compare_tree_result result)
 {
@@ -4141,6 +4260,11 @@ static int changed_xattr(struct send_ctx *sctx,
 	return ret;
 }
 
+/*
+ * Process new/deleted/changed extents. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of extents. The reason is the same as in changed_ref
+ */
 static int changed_extent(struct send_ctx *sctx,
 			  enum btrfs_compare_tree_result result)
 {
@@ -4157,7 +4281,10 @@ static int changed_extent(struct send_ctx *sctx,
 	return ret;
 }
 
-
+/*
+ * Updates compare related fields in sctx and simply forwards to the actual
+ * changed_xxx functions.
+ */
 static int changed_cb(struct btrfs_root *left_root,
 		      struct btrfs_root *right_root,
 		      struct btrfs_path *left_path,
@@ -4229,7 +4356,8 @@ join_trans:
 	}
 
 	/*
-	 * Make sure the tree has not changed
+	 * Make sure the tree has not changed after re-joining. We detect this
+	 * by comparing start_ctransid and ctransid. They should always match.
 	 */
 	spin_lock(&send_root->root_times_lock);
 	ctransid = btrfs_root_ctransid(&send_root->root_item);
-- 
cgit v0.10.2


From e938c8ad543cd5ffe344371747484303ad51dfba Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 16:33:49 +0200
Subject: Btrfs: code cleanups for send/receive

Doing some code cleanups as suggested by Arne.
Changes do not change any logic.

Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6926546..3bc921a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1115,10 +1115,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 		 */
 		if (ino >= bctx->cur_objectid)
 			return 0;
-		/*if (ino > ctx->cur_objectid)
+#if 0
+		if (ino > bctx->cur_objectid)
+			return 0;
+		if (offset + bctx->extent_len > bctx->cur_offset)
 			return 0;
-		if (offset + ctx->extent_len > ctx->cur_offset)
-			return 0;*/
+#endif
 	}
 
 	bctx->found++;
@@ -1327,8 +1329,6 @@ static int read_symlink(struct send_ctx *sctx,
 	len = btrfs_file_extent_inline_len(path->nodes[0], ei);
 
 	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
-	if (ret < 0)
-		goto out;
 
 out:
 	btrfs_free_path(path);
@@ -1440,9 +1440,9 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
 	}
 
 	if (!left_ret && !right_ret) {
-		if (left_gen == gen && right_gen == gen)
+		if (left_gen == gen && right_gen == gen) {
 			ret = inode_state_no_change;
-		else if (left_gen == gen) {
+		} else if (left_gen == gen) {
 			if (ino < sctx->send_progress)
 				ret = inode_state_did_create;
 			else
@@ -1615,11 +1615,7 @@ static int is_first_ref(struct send_ctx *sctx,
 		goto out;
 	}
 
-	ret = memcmp(tmp_name->start, name, name_len);
-	if (ret)
-		ret = 0;
-	else
-		ret = 1;
+	ret = !memcmp(tmp_name->start, name, name_len);
 
 out:
 	fs_path_free(sctx, tmp_name);
@@ -1761,8 +1757,6 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 
 	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
 			name->start, fs_path_len(name));
-	if (ret < 0)
-		goto out;
 
 out:
 	fs_path_free(sctx, name);
@@ -1866,9 +1860,10 @@ static void name_cache_clean_unused(struct send_ctx *sctx)
 static void name_cache_free(struct send_ctx *sctx)
 {
 	struct name_cache_entry *nce;
-	struct name_cache_entry *tmp;
 
-	list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
+	while (!list_empty(&sctx->name_cache_list)) {
+		nce = list_entry(sctx->name_cache_list.next,
+				struct name_cache_entry, list);
 		name_cache_delete(sctx, nce);
 		kfree(nce);
 	}
@@ -2188,9 +2183,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
 	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
 	btrfs_release_path(path);
 
-	if (ret < 0)
-		goto out;
-
 	if (parent_root) {
 		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
 		if (ret < 0)
@@ -2393,19 +2385,19 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
 	if (ret < 0)
 		goto out;
 
-	if (S_ISREG(mode))
+	if (S_ISREG(mode)) {
 		cmd = BTRFS_SEND_C_MKFILE;
-	else if (S_ISDIR(mode))
+	} else if (S_ISDIR(mode)) {
 		cmd = BTRFS_SEND_C_MKDIR;
-	else if (S_ISLNK(mode))
+	} else if (S_ISLNK(mode)) {
 		cmd = BTRFS_SEND_C_SYMLINK;
-	else if (S_ISCHR(mode) || S_ISBLK(mode))
+	} else if (S_ISCHR(mode) || S_ISBLK(mode)) {
 		cmd = BTRFS_SEND_C_MKNOD;
-	else if (S_ISFIFO(mode))
+	} else if (S_ISFIFO(mode)) {
 		cmd = BTRFS_SEND_C_MKFIFO;
-	else if (S_ISSOCK(mode))
+	} else if (S_ISSOCK(mode)) {
 		cmd = BTRFS_SEND_C_MKSOCK;
-	else {
+	} else {
 		printk(KERN_WARNING "btrfs: unexpected inode type %o",
 				(int)(mode & S_IFMT));
 		ret = -ENOTSUPP;
@@ -2583,13 +2575,13 @@ static int record_ref(struct list_head *head, u64 dir,
 static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
 {
 	struct recorded_ref *cur;
-	struct recorded_ref *tmp;
 
-	list_for_each_entry_safe(cur, tmp, head, list) {
+	while (!list_empty(head)) {
+		cur = list_entry(head->next, struct recorded_ref, list);
 		fs_path_free(sctx, cur->full_path);
+		list_del(&cur->list);
 		kfree(cur);
 	}
-	INIT_LIST_HEAD(head);
 }
 
 static void free_recorded_refs(struct send_ctx *sctx)
@@ -3205,24 +3197,18 @@ static int process_all_refs(struct send_ctx *sctx,
 	key.offset = 0;
 	while (1) {
 		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-		if (ret < 0) {
-			btrfs_release_path(path);
+		if (ret < 0)
 			goto out;
-		}
-		if (ret) {
-			btrfs_release_path(path);
+		if (ret)
 			break;
-		}
 
 		eb = path->nodes[0];
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(eb, &found_key, slot);
 
 		if (found_key.objectid != key.objectid ||
-		    found_key.type != key.type) {
-			btrfs_release_path(path);
+		    found_key.type != key.type)
 			break;
-		}
 
 		ret = iterate_inode_ref(sctx, sctx->parent_root, path,
 				&found_key, 0, cb, sctx);
@@ -3232,6 +3218,7 @@ static int process_all_refs(struct send_ctx *sctx,
 
 		key.offset = found_key.offset + 1;
 	}
+	btrfs_release_path(path);
 
 	ret = process_recorded_refs(sctx);
 
@@ -3554,7 +3541,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
 	int ret = 0;
 	struct fs_path *p;
 	loff_t pos = offset;
-	int readed = 0;
+	int num_read = 0;
 	mm_segment_t old_fs;
 
 	p = fs_path_alloc(sctx);
@@ -3579,8 +3566,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
 	if (ret < 0)
 		goto out;
-	readed = ret;
-	if (!readed)
+	num_read = ret;
+	if (!num_read)
 		goto out;
 
 	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
@@ -3593,7 +3580,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
-	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed);
+	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
 
 	ret = send_cmd(sctx);
 
@@ -3603,7 +3590,7 @@ out:
 	set_fs(old_fs);
 	if (ret < 0)
 		return ret;
-	return readed;
+	return num_read;
 }
 
 /*
@@ -3614,7 +3601,6 @@ static int send_clone(struct send_ctx *sctx,
 		      struct clone_root *clone_root)
 {
 	int ret = 0;
-	struct btrfs_root *clone_root2 = clone_root->root;
 	struct fs_path *p;
 	u64 gen;
 
@@ -3639,22 +3625,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
 
-	if (clone_root2 == sctx->send_root) {
+	if (clone_root->root == sctx->send_root) {
 		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
 				&gen, NULL, NULL, NULL, NULL);
 		if (ret < 0)
 			goto out;
 		ret = get_cur_path(sctx, clone_root->ino, gen, p);
 	} else {
-		ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
+		ret = get_inode_path(sctx, clone_root->root,
+				clone_root->ino, p);
 	}
 	if (ret < 0)
 		goto out;
 
 	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
-			clone_root2->root_item.uuid);
+			clone_root->root->root_item.uuid);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
-			clone_root2->root_item.ctransid);
+			clone_root->root->root_item.ctransid);
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
 			clone_root->offset);
-- 
cgit v0.10.2


From 3e126f32f88095ad1bd01b3c451e52aa9094f45c Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 12:03:09 +0200
Subject: Btrfs: remove unused tmp_path from iterate_dir_item

A leftover from older code and unused now.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3bc921a..b0f9df3 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -864,7 +864,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
 	struct extent_buffer *eb;
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
-	struct btrfs_path *tmp_path = NULL;
 	struct btrfs_key di_key;
 	char *buf = NULL;
 	char *buf2 = NULL;
@@ -886,12 +885,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
 		goto out;
 	}
 
-	tmp_path = alloc_path_for_send();
-	if (!tmp_path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	eb = path->nodes[0];
 	slot = path->slots[0];
 	item = btrfs_item_nr(eb, slot);
@@ -953,7 +946,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
 	}
 
 out:
-	btrfs_free_path(tmp_path);
 	if (buf_virtual)
 		vfree(buf);
 	else
-- 
cgit v0.10.2


From 5dc67d0ba915cda01ed3a980502945edf2c46b70 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 12:07:43 +0200
Subject: Btrfs: free nce and nce_head on error in name_cache_insert

Both were leaked in case of error.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b0f9df3..cfbe987 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1759,6 +1759,7 @@ out:
  * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
  * so we need to do some special handling in case we have clashes. This function
  * takes care of this with the help of name_cache_entry::radix_list.
+ * In case of error, nce is kfreed.
  */
 static int name_cache_insert(struct send_ctx *sctx,
 			     struct name_cache_entry *nce)
@@ -1775,8 +1776,11 @@ static int name_cache_insert(struct send_ctx *sctx,
 		INIT_LIST_HEAD(nce_head);
 
 		ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
-		if (ret < 0)
+		if (ret < 0) {
+			kfree(nce_head);
+			kfree(nce);
 			return ret;
+		}
 	}
 	list_add_tail(&nce->radix_list, nce_head);
 	list_add_tail(&nce->list, &sctx->name_cache_list);
-- 
cgit v0.10.2


From 3954096d4b1adab6fbb3c26de37ddd4f781e072f Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 12:46:05 +0200
Subject: Btrfs: fix check for changed extent in is_extent_unchanged

The previous check was working fine, but this check should be
easier to read. Also, we could theoritically have some exotic
bugs with the previous checks.

Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index cfbe987..35222d5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3819,8 +3819,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 		/*
 		 * Check if we have the same extent.
 		 */
-		if (left_disknr + left_offset_fixed !=
-				right_disknr + right_offset) {
+		if (left_disknr != right_disknr ||
+		    left_offset_fixed != right_offset) {
 			ret = 0;
 			goto out;
 		}
-- 
cgit v0.10.2


From d8347fa444c682dc8a1e24e7158bfa2dd4c9ca71 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 12:49:15 +0200
Subject: Btrfs: use <= instead of < in is_extent_unchanged

Used the wrong compare operator here.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 35222d5..c6c10a4 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3802,7 +3802,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 		 * Are we at extent 8? If yes, we know the extent is changed.
 		 * This may only happen on the first iteration.
 		 */
-		if (found_key.offset + right_len < ekey->offset) {
+		if (found_key.offset + right_len <= ekey->offset) {
 			ret = 0;
 			goto out;
 		}
-- 
cgit v0.10.2


From 2f28f4787c0fc37c508cfb6b7b11c00ce5072940 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 14:42:14 +0200
Subject: Btrfs: pass root instead of parent_root to iterate_inode_ref

We need to pass the root that we determined earlier to iterate_inode_ref.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index c6c10a4..db051d1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3206,8 +3206,8 @@ static int process_all_refs(struct send_ctx *sctx,
 		    found_key.type != key.type)
 			break;
 
-		ret = iterate_inode_ref(sctx, sctx->parent_root, path,
-				&found_key, 0, cb, sctx);
+		ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
+				sctx);
 		btrfs_release_path(path);
 		if (ret < 0)
 			goto out;
-- 
cgit v0.10.2


From 2981e225f7048b36470383bf5622c42139bd96f7 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 14:47:03 +0200
Subject: Btrfs: ignore non-FS inodes for send/receive

We have to ignore inode/space cache objects in send/receive.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index db051d1..a4011a9 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4287,6 +4287,11 @@ static int changed_cb(struct btrfs_root *left_root,
 	if (ret < 0)
 		goto out;
 
+	/* Ignore non-FS objects */
+	if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
+	    key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+		goto out;
+
 	if (key->type == BTRFS_INODE_ITEM_KEY)
 		ret = changed_inode(sctx, result);
 	else if (key->type == BTRFS_INODE_REF_KEY)
-- 
cgit v0.10.2


From 6d85ed05e16e7ff747025c8374f23d7d81c98540 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 14:48:59 +0200
Subject: Btrfs: don't treat top/root directory inode as deleted/reused

We can't do the deleted/reused logic for top/root inodes as it would
create a stream that tries to delete and recreate the root dir.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a4011a9..d17d75e 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2627,6 +2627,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
 	struct btrfs_key loc;
 	struct btrfs_dir_item *di;
 
+	/*
+	 * Don't try to rmdir the top/root subvolume dir.
+	 */
+	if (dir == BTRFS_FIRST_FREE_OBJECTID)
+		return 0;
+
 	path = alloc_path_for_send();
 	if (!path)
 		return -ENOMEM;
@@ -2687,6 +2693,12 @@ static int process_recorded_refs(struct send_ctx *sctx)
 
 verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 
+	/*
+	 * This should never happen as the root dir always has the same ref
+	 * which is always '..'
+	 */
+	BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+
 	valid_path = fs_path_alloc(sctx);
 	if (!valid_path) {
 		ret = -ENOMEM;
@@ -4094,7 +4106,14 @@ static int changed_inode(struct send_ctx *sctx,
 
 		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
 				right_ii);
-		if (left_gen != right_gen)
+
+		/*
+		 * The cur_ino = root dir case is special here. We can't treat
+		 * the inode as deleted+reused because it would generate a
+		 * stream that tries to delete/mkdir the root dir.
+		 */
+		if (left_gen != right_gen &&
+		    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
 			sctx->cur_inode_new_gen = 1;
 	}
 
-- 
cgit v0.10.2


From 74dd17fbe3d65829e75d84f00a9525b2ace93998 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@fusionio.com>
Date: Tue, 7 Aug 2012 16:25:13 -0400
Subject: Btrfs: fix btrfs send for inline items and compression

The btrfs send code was assuming the offset of the file item into the
extent translated to bytes on disk.  If we're compressed, this isn't
true, and so it was off into extents owned by other files.

It was also improperly handling inline extents.  This solves a crash
where we may have gone past the end of the file extent item by not
testing early enough for an inline extent.  It also solves problems
where we have a whole between the end of the inline item and the start
of the full extent.

Signed-off-by: Chris Mason <chris.mason@fusionio.com>

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d183f6..fcc8c21 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5073,6 +5073,7 @@ static void tree_move_down(struct btrfs_root *root,
 			   struct btrfs_path *path,
 			   int *level, int root_level)
 {
+	BUG_ON(*level == 0);
 	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
 					path->slots[*level]);
 	path->slots[*level - 1] = 0;
@@ -5089,7 +5090,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root,
 
 	path->slots[*level]++;
 
-	while (path->slots[*level] == nritems) {
+	while (path->slots[*level] >= nritems) {
 		if (*level == root_level)
 			return -1;
 
@@ -5433,9 +5434,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 					goto out;
 				advance_right = ADVANCE;
 			} else {
+				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
 				ret = tree_compare_item(left_root, left_path,
 						right_path, tmp_buf);
 				if (ret) {
+					WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
 					ret = changed_cb(left_root, right_root,
 						left_path, right_path,
 						&left_key,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4c87847..19319f5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4332,7 +4332,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
 
 		/* Should be safe to release our pages at this point */
 		btrfs_release_extent_buffer_page(eb, 0);
-
 		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
 		return 1;
 	}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d17d75e..f22fdd4 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1149,6 +1149,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	int ret;
 	int extent_type;
 	u64 logical;
+	u64 disk_byte;
 	u64 num_bytes;
 	u64 extent_item_pos;
 	struct btrfs_file_extent_item *fi;
@@ -1157,6 +1158,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	struct clone_root *cur_clone_root;
 	struct btrfs_key found_key;
 	struct btrfs_path *tmp_path;
+	int compressed;
 	u32 i;
 
 	tmp_path = alloc_path_for_send();
@@ -1186,17 +1188,18 @@ static int find_extent_clone(struct send_ctx *sctx,
 		ret = -ENOENT;
 		goto out;
 	}
+	compressed = btrfs_file_extent_compression(eb, fi);
 
 	num_bytes = btrfs_file_extent_num_bytes(eb, fi);
-	logical = btrfs_file_extent_disk_bytenr(eb, fi);
-	if (logical == 0) {
+	disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+	if (disk_byte == 0) {
 		ret = -ENOENT;
 		goto out;
 	}
-	logical += btrfs_file_extent_offset(eb, fi);
+	logical = disk_byte + btrfs_file_extent_offset(eb, fi);
 
 	ret = extent_from_logical(sctx->send_root->fs_info,
-			logical, tmp_path, &found_key);
+			disk_byte, tmp_path, &found_key);
 	btrfs_release_path(tmp_path);
 
 	if (ret < 0)
@@ -1234,10 +1237,16 @@ static int find_extent_clone(struct send_ctx *sctx,
 	/*
 	 * Now collect all backrefs.
 	 */
+	if (compressed == BTRFS_COMPRESS_NONE)
+		extent_item_pos = logical - found_key.objectid;
+	else
+		extent_item_pos = 0;
+
 	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(sctx->send_root->fs_info,
 					found_key.objectid, extent_item_pos, 1,
 					__iterate_backrefs, backref_ctx);
+
 	if (ret < 0)
 		goto out;
 
@@ -1246,8 +1255,8 @@ static int find_extent_clone(struct send_ctx *sctx,
 		ret = -EIO;
 		printk(KERN_ERR "btrfs: ERROR did not find backref in "
 				"send_root. inode=%llu, offset=%llu, "
-				"logical=%llu\n",
-				ino, data_offset, logical);
+				"disk_byte=%llu found extent=%llu\n",
+				ino, data_offset, disk_byte, found_key.objectid);
 		goto out;
 	}
 
@@ -3678,10 +3687,17 @@ static int send_write_or_clone(struct send_ctx *sctx,
 	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 			struct btrfs_file_extent_item);
 	type = btrfs_file_extent_type(path->nodes[0], ei);
-	if (type == BTRFS_FILE_EXTENT_INLINE)
+	if (type == BTRFS_FILE_EXTENT_INLINE) {
 		len = btrfs_file_extent_inline_len(path->nodes[0], ei);
-	else
+		/*
+		 * it is possible the inline item won't cover the whole page,
+		 * but there may be items after this page.  Make
+		 * sure to send the whole thing
+		 */
+		len = PAGE_CACHE_ALIGN(len);
+	} else {
 		len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+	}
 
 	if (offset + len > sctx->cur_inode_size)
 		len = sctx->cur_inode_size - offset;
@@ -3729,6 +3745,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	u64 left_offset_fixed;
 	u64 left_len;
 	u64 right_len;
+	u64 left_gen;
+	u64 right_gen;
 	u8 left_type;
 	u8 right_type;
 
@@ -3738,17 +3756,17 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 
 	eb = left_path->nodes[0];
 	slot = left_path->slots[0];
-
 	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 	left_type = btrfs_file_extent_type(eb, ei);
-	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
-	left_len = btrfs_file_extent_num_bytes(eb, ei);
-	left_offset = btrfs_file_extent_offset(eb, ei);
 
 	if (left_type != BTRFS_FILE_EXTENT_REG) {
 		ret = 0;
 		goto out;
 	}
+	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+	left_len = btrfs_file_extent_num_bytes(eb, ei);
+	left_offset = btrfs_file_extent_offset(eb, ei);
+	left_gen = btrfs_file_extent_generation(eb, ei);
 
 	/*
 	 * Following comments will refer to these graphics. L is the left
@@ -3804,6 +3822,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
 		right_len = btrfs_file_extent_num_bytes(eb, ei);
 		right_offset = btrfs_file_extent_offset(eb, ei);
+		right_gen = btrfs_file_extent_generation(eb, ei);
 
 		if (right_type != BTRFS_FILE_EXTENT_REG) {
 			ret = 0;
@@ -3832,7 +3851,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 		 * Check if we have the same extent.
 		 */
 		if (left_disknr != right_disknr ||
-		    left_offset_fixed != right_offset) {
+		    left_offset_fixed != right_offset ||
+		    left_gen != right_gen) {
 			ret = 0;
 			goto out;
 		}
-- 
cgit v0.10.2


From 995e01b7af745b8aaa5e882cfb7bfd5baab3f335 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 13 Aug 2012 02:52:38 -0600
Subject: Btrfs: fix gcc warnings for 32bit compiles

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 6655ca6..e600857 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -231,7 +231,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			}
 			if (!ret) {
 				ret = ulist_add(parents, eb->start,
-						(u64)eie, GFP_NOFS);
+						(uintptr_t)eie, GFP_NOFS);
 				if (ret < 0)
 					break;
 				if (!extent_item_pos) {
@@ -363,8 +363,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&uiter);
 		node = ulist_next(parents, &uiter);
 		ref->parent = node ? node->val : 0;
-		ref->inode_list =
-			node ? (struct extent_inode_elem *)node->aux : 0;
+		ref->inode_list = node ?
+			(struct extent_inode_elem *)(uintptr_t)node->aux : 0;
 
 		/* additional parents require new refs being added here */
 		while ((node = ulist_next(parents, &uiter))) {
@@ -375,8 +375,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
-			new_ref->inode_list =
-					(struct extent_inode_elem *)node->aux;
+			new_ref->inode_list = (struct extent_inode_elem *)
+							(uintptr_t)node->aux;
 			list_add(&new_ref->list, &ref->list);
 		}
 		ulist_reinit(parents);
@@ -914,7 +914,7 @@ again:
 				free_extent_buffer(eb);
 			}
 			ret = ulist_add_merge(refs, ref->parent,
-					      (u64)ref->inode_list,
+					      (uintptr_t)ref->inode_list,
 					      (u64 *)&eie, GFP_NOFS);
 			if (!ret && extent_item_pos) {
 				/*
@@ -959,7 +959,7 @@ static void free_leaf_list(struct ulist *blocks)
 	while ((node = ulist_next(blocks, &uiter))) {
 		if (!node->aux)
 			continue;
-		eie = (struct extent_inode_elem *)node->aux;
+		eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
 		for (; eie; eie = eie_next) {
 			eie_next = eie->next;
 			kfree(eie);
@@ -1405,11 +1405,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
 			pr_debug("root %llu references leaf %llu, data list "
 				 "%#llx\n", root_node->val, ref_node->val,
-				 ref_node->aux);
-			ret = iterate_leaf_refs(
-				(struct extent_inode_elem *)ref_node->aux,
-				root_node->val, extent_item_objectid,
-				iterate, ctx);
+				 (long long)ref_node->aux);
+			ret = iterate_leaf_refs((struct extent_inode_elem *)
+						(uintptr_t)ref_node->aux,
+						root_node->val,
+						extent_item_objectid,
+						iterate, ctx);
 		}
 		ulist_free(roots);
 		roots = NULL;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 857d93c..54f0335 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,7 +25,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 
-#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
 				   sizeof(struct btrfs_item) * 2) / \
 				  size) - 1))
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 24f6adc..9b707ba 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1145,12 +1145,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 		ulist_reinit(tmp);
 						/* XXX id not needed */
-		ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC);
+		ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
 		ULIST_ITER_INIT(&tmp_uiter);
 		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
 			struct btrfs_qgroup_list *glist;
 
-			qg = (struct btrfs_qgroup *)tmp_unode->aux;
+			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
 			if (qg->refcnt < seq)
 				qg->refcnt = seq + 1;
 			else
@@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 			list_for_each_entry(glist, &qg->groups, next_group) {
 				ulist_add(tmp, glist->group->qgroupid,
-					  (u64)glist->group,
+					  (u64)(uintptr_t)glist->group,
 					  GFP_ATOMIC);
 			}
 		}
@@ -1168,13 +1168,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 	 * step 2: walk from the new root
 	 */
 	ulist_reinit(tmp);
-	ulist_add(tmp, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
+	ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(tmp, &uiter))) {
 		struct btrfs_qgroup *qg;
 		struct btrfs_qgroup_list *glist;
 
-		qg = (struct btrfs_qgroup *)unode->aux;
+		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 		if (qg->refcnt < seq) {
 			/* not visited by step 1 */
 			qg->rfer += sgn * node->num_bytes;
@@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(tmp, glist->group->qgroupid,
-				  (u64)glist->group, GFP_ATOMIC);
+				  (uintptr_t)glist->group, GFP_ATOMIC);
 		}
 	}
 
@@ -1208,12 +1208,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 			continue;
 
 		ulist_reinit(tmp);
-		ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC);
+		ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
 		ULIST_ITER_INIT(&tmp_uiter);
 		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
 			struct btrfs_qgroup_list *glist;
 
-			qg = (struct btrfs_qgroup *)tmp_unode->aux;
+			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
 			if (qg->tag == seq)
 				continue;
 
@@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 			list_for_each_entry(glist, &qg->groups, next_group) {
 				ulist_add(tmp, glist->group->qgroupid,
-					  (u64)glist->group,
+					  (uintptr_t)glist->group,
 					  GFP_ATOMIC);
 			}
 		}
@@ -1469,13 +1469,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 	 * be exceeded
 	 */
 	ulist = ulist_alloc(GFP_ATOMIC);
-	ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
+	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
 		struct btrfs_qgroup_list *glist;
 
-		qg = (struct btrfs_qgroup *)unode->aux;
+		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 
 		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
 		    qg->reserved + qg->rfer + num_bytes >
@@ -1489,7 +1489,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(ulist, glist->group->qgroupid,
-				  (u64)glist->group, GFP_ATOMIC);
+				  (uintptr_t)glist->group, GFP_ATOMIC);
 		}
 	}
 	if (ret)
@@ -1502,7 +1502,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 	while ((unode = ulist_next(ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
 
-		qg = (struct btrfs_qgroup *)unode->aux;
+		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 
 		qg->reserved += num_bytes;
 	}
@@ -1541,19 +1541,19 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 		goto out;
 
 	ulist = ulist_alloc(GFP_ATOMIC);
-	ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
+	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
 		struct btrfs_qgroup_list *glist;
 
-		qg = (struct btrfs_qgroup *)unode->aux;
+		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 
 		qg->reserved -= num_bytes;
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(ulist, glist->group->qgroupid,
-				  (u64)glist->group, GFP_ATOMIC);
+				  (uintptr_t)glist->group, GFP_ATOMIC);
 		}
 	}
 
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f22fdd4..e5c8679 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1035,7 +1035,7 @@ struct backref_ctx {
 
 static int __clone_root_cmp_bsearch(const void *key, const void *elt)
 {
-	u64 root = (u64)key;
+	u64 root = (u64)(uintptr_t)key;
 	struct clone_root *cr = (struct clone_root *)elt;
 
 	if (root < cr->root->objectid)
@@ -1069,7 +1069,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	u64 i_size;
 
 	/* First check if the root is in the list of accepted clone sources */
-	found = bsearch((void *)root, bctx->sctx->clone_roots,
+	found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
 			bctx->sctx->clone_roots_cnt,
 			sizeof(struct clone_root),
 			__clone_root_cmp_bsearch);
-- 
cgit v0.10.2


From 7c735313bd1277c2eb28421934d4c7a0fa7339f7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 13 Aug 2012 15:43:26 -0400
Subject: Btrfs: update last trans if we don't update the inode

There is a completely impossible situation to hit where you can preallocate
a file, fsync it, write into the preallocated region, have the transaction
commit twice and then fsync and then immediately lose power and lose all of
the contents of the write.  This patch fixes this just so I feel better
about the situation and because it is lightweight, we just update the
last_trans when we finish an ordered IO and we don't update the inode
itself.  This way we are completely safe and I feel better.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ec154f9..6971bac 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1949,6 +1949,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 			btrfs_abort_transaction(trans, root, ret);
 			goto out_unlock;
 		}
+	} else {
+		btrfs_set_inode_last_trans(trans, inode);
 	}
 	ret = 0;
 out_unlock:
-- 
cgit v0.10.2


From 54338b5cc4fa3cfe260e8e4ade8b62a9079ea3f9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 14 Aug 2012 16:20:52 -0400
Subject: Btrfs: do not allocate chunks as agressively

Swinging this pendulum back the other way.  We've been allocating chunks up
to 2% of the disk no matter how much we actually have allocated.  So instead
fix this calculation to only allocate chunks if we have more than 80% of the
space available allocated.  Please test this as it will likely cause all
sorts of ENOSPC problems to pop up suddenly.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ba58024..62fc92f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3504,7 +3504,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	 * and purposes it's used space.  Don't worry about locking the
 	 * global_rsv, it doesn't change except when the transaction commits.
 	 */
-	num_allocated += global_rsv->size;
+	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+		num_allocated += global_rsv->size;
 
 	/*
 	 * in limited mode, we want to have some free space up to
@@ -3518,15 +3519,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
 		if (num_bytes - num_allocated < thresh)
 			return 1;
 	}
-	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 
-	/* 256MB or 2% of the FS */
-	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
-	/* system chunks need a much small threshold */
-	if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		thresh = 32 * 1024 * 1024;
-
-	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
+	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
 		return 0;
 	return 1;
 }
-- 
cgit v0.10.2


From 224ecce517af3a952321202cdf304c12e138caca Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 16 Aug 2012 16:32:06 -0400
Subject: Btrfs: fix possible corruption when fsyncing written prealloced
 extents

While working on my fsync patch my fsync tester kept hitting mismatching
md5sums when I would randomly write to a prealloc'ed region, syncfs() and
then write to the prealloced region some more and then fsync() and then
immediately reboot.  This is because the tree logging code will skip writing
csums for file extents who's generation is less than the current running
transaction.  When we mark extents as written we haven't been updating their
generation so they were always being skipped.  This wouldn't happen if you
were to preallocate and then write in the same transaction, but if you for
example prealloced a VM you could definitely run into this problem.  This
patch makes my fsync tester happy again.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5caf285..b7c885c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -935,12 +935,16 @@ again:
 			btrfs_set_item_key_safe(trans, root, path, &new_key);
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_end - end);
 			btrfs_set_file_extent_offset(leaf, fi,
 						     end - orig_offset);
 			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							end - other_start);
 			btrfs_mark_buffer_dirty(leaf);
@@ -958,12 +962,16 @@ again:
 					    struct btrfs_file_extent_item);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							start - key.offset);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			path->slots[0]++;
 			new_key.offset = start;
 			btrfs_set_item_key_safe(trans, root, path, &new_key);
 
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							other_end - start);
 			btrfs_set_file_extent_offset(leaf, fi,
@@ -991,12 +999,14 @@ again:
 		leaf = path->nodes[0];
 		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 				    struct btrfs_file_extent_item);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
 						split - key.offset);
 
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
 
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
 						extent_end - split);
@@ -1056,12 +1066,14 @@ again:
 			   struct btrfs_file_extent_item);
 		btrfs_set_file_extent_type(leaf, fi,
 					   BTRFS_FILE_EXTENT_REG);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_mark_buffer_dirty(leaf);
 	} else {
 		fi = btrfs_item_ptr(leaf, del_slot - 1,
 			   struct btrfs_file_extent_item);
 		btrfs_set_file_extent_type(leaf, fi,
 					   BTRFS_FILE_EXTENT_REG);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
 						extent_end - key.offset);
 		btrfs_mark_buffer_dirty(leaf);
-- 
cgit v0.10.2


From 5dc562c541e1026df9d43913c2f6b91156e22d32 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 17 Aug 2012 13:14:17 -0400
Subject: Btrfs: turbo charge fsync

At least for the vm workload.  Currently on fsync we will

1) Truncate all items in the log tree for the given inode if they exist

and

2) Copy all items for a given inode into the log

The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing.  This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them.  We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already.  Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write

		Original	Patched
SATA drive	82KB/s		140KB/s
Fusion drive	431KB/s		2532KB/s

So around 2-6 times faster depending on your hardware.  There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok.  This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get.  All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.

The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok.  I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5b2ad6b..7c7bf81 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -38,6 +38,7 @@
 #define BTRFS_INODE_DELALLOC_META_RESERVED	4
 #define BTRFS_INODE_HAS_ORPHAN_ITEM		5
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
+#define BTRFS_INODE_NEEDS_FULL_SYNC		7
 
 /* in memory btrfs inode */
 struct btrfs_inode {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0d195b5..4b81ea3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3315,9 +3315,17 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
+int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
+			       u64 start, u64 end, int skip_pinned,
+			       int modified);
 extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-		       u64 start, u64 end, u64 *hint_byte, int drop_cache);
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root, struct inode *inode,
+			 struct btrfs_path *path, u64 start, u64 end,
+			 u64 *hint_byte, int drop_cache);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct inode *inode, u64 start,
+		       u64 end, u64 *hint_byte, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b33..1fe82cf 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -35,6 +35,7 @@ void extent_map_exit(void)
 void extent_map_tree_init(struct extent_map_tree *tree)
 {
 	tree->map = RB_ROOT;
+	INIT_LIST_HEAD(&tree->modified_extents);
 	rwlock_init(&tree->lock);
 }
 
@@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void)
 	em->in_tree = 0;
 	em->flags = 0;
 	em->compress_type = BTRFS_COMPRESS_NONE;
+	em->generation = 0;
 	atomic_set(&em->refs, 1);
+	INIT_LIST_HEAD(&em->list);
 	return em;
 }
 
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
 	WARN_ON(atomic_read(&em->refs) == 0);
 	if (atomic_dec_and_test(&em->refs)) {
 		WARN_ON(em->in_tree);
+		WARN_ON(!list_empty(&em->list));
 		kmem_cache_free(extent_map_cache, em);
 	}
 }
@@ -198,6 +202,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
+			if (merge->generation > em->generation) {
+				em->generation = merge->generation;
+				list_move(&em->list, &tree->modified_extents);
+			}
+
+			list_del_init(&merge->list);
 			rb_erase(&merge->rb_node, &tree->map);
 			free_extent_map(merge);
 		}
@@ -211,11 +221,29 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		em->block_len += merge->len;
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
+		if (merge->generation > em->generation) {
+			em->generation = merge->generation;
+			list_move(&em->list, &tree->modified_extents);
+		}
+		list_del_init(&merge->list);
 		free_extent_map(merge);
 	}
 }
 
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+/**
+ * unpint_extent_cache - unpin an extent from the cache
+ * @tree:	tree to unpin the extent in
+ * @start:	logical offset in the file
+ * @len:	length of the extent
+ * @gen:	generation that this extent has been modified in
+ * @prealloc:	if this is set we need to clear the prealloc flag
+ *
+ * Called after an extent has been written to disk properly.  Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+		       u64 gen)
 {
 	int ret = 0;
 	struct extent_map *em;
@@ -228,10 +256,11 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
 	if (!em)
 		goto out;
 
+	list_move(&em->list, &tree->modified_extents);
+	em->generation = gen;
 	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 	try_merge_map(tree, em);
-
 	free_extent_map(em);
 out:
 	write_unlock(&tree->lock);
@@ -358,6 +387,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 
 	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
 	rb_erase(&em->rb_node, &tree->map);
+	list_del_init(&em->list);
 	em->in_tree = 0;
 	return ret;
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 1195f09..2388a60 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -23,15 +23,18 @@ struct extent_map {
 	u64 orig_start;
 	u64 block_start;
 	u64 block_len;
+	u64 generation;
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
 	unsigned int in_tree;
 	unsigned int compress_type;
+	struct list_head list;
 };
 
 struct extent_map_tree {
 	struct rb_root map;
+	struct list_head modified_extents;
 	rwlock_t lock;
 };
 
@@ -60,7 +63,7 @@ struct extent_map *alloc_extent_map(void);
 void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b7c885c..399f9d7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -459,13 +459,14 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
  * [start, end].  Existing extents are split as required.
  */
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
-			    int skip_pinned)
+			       int skip_pinned)
 {
 	struct extent_map *em;
 	struct extent_map *split = NULL;
 	struct extent_map *split2 = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 len = end - start + 1;
+	u64 gen;
 	int ret;
 	int testend = 1;
 	unsigned long flags;
@@ -490,6 +491,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			break;
 		}
 		flags = em->flags;
+		gen = em->generation;
 		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
 			if (testend && em->start + em->len >= start + len) {
 				free_extent_map(em);
@@ -518,12 +520,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 				split->block_len = em->block_len;
 			else
 				split->block_len = split->len;
-
+			split->generation = gen;
 			split->bdev = em->bdev;
 			split->flags = flags;
 			split->compress_type = em->compress_type;
 			ret = add_extent_mapping(em_tree, split);
 			BUG_ON(ret); /* Logic error */
+			list_move(&split->list, &em_tree->modified_extents);
 			free_extent_map(split);
 			split = split2;
 			split2 = NULL;
@@ -537,6 +540,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->bdev = em->bdev;
 			split->flags = flags;
 			split->compress_type = em->compress_type;
+			split->generation = gen;
 
 			if (compressed) {
 				split->block_len = em->block_len;
@@ -550,6 +554,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
 			ret = add_extent_mapping(em_tree, split);
 			BUG_ON(ret); /* Logic error */
+			list_move(&split->list, &em_tree->modified_extents);
 			free_extent_map(split);
 			split = NULL;
 		}
@@ -576,13 +581,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
  * it is either truncated or split.  Anything entirely inside the range
  * is deleted from the tree.
  */
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-		       u64 start, u64 end, u64 *hint_byte, int drop_cache)
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root, struct inode *inode,
+			 struct btrfs_path *path, u64 start, u64 end,
+			 u64 *hint_byte, int drop_cache)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
-	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_key new_key;
 	u64 ino = btrfs_ino(inode);
@@ -597,14 +602,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 	int recow;
 	int ret;
 	int modify_tree = -1;
+	int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
 
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
 	if (start >= BTRFS_I(inode)->disk_i_size)
 		modify_tree = 0;
 
@@ -707,7 +709,7 @@ next_slot:
 							extent_end - start);
 			btrfs_mark_buffer_dirty(leaf);
 
-			if (disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0) {
 				ret = btrfs_inc_extent_ref(trans, root,
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
@@ -734,7 +736,7 @@ next_slot:
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_end - end);
 			btrfs_mark_buffer_dirty(leaf);
-			if (disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0) {
 				inode_sub_bytes(inode, end - key.offset);
 				*hint_byte = disk_bytenr;
 			}
@@ -753,7 +755,7 @@ next_slot:
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							start - key.offset);
 			btrfs_mark_buffer_dirty(leaf);
-			if (disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0) {
 				inode_sub_bytes(inode, extent_end - start);
 				*hint_byte = disk_bytenr;
 			}
@@ -777,12 +779,13 @@ next_slot:
 				del_nr++;
 			}
 
-			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			if (update_refs &&
+			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
 				inode_sub_bytes(inode,
 						extent_end - key.offset);
 				extent_end = ALIGN(extent_end,
 						   root->sectorsize);
-			} else if (disk_bytenr > 0) {
+			} else if (update_refs && disk_bytenr > 0) {
 				ret = btrfs_free_extent(trans, root,
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
@@ -806,7 +809,7 @@ next_slot:
 					      del_nr);
 			if (ret) {
 				btrfs_abort_transaction(trans, root, ret);
-				goto out;
+				break;
 			}
 
 			del_nr = 0;
@@ -825,7 +828,22 @@ next_slot:
 			btrfs_abort_transaction(trans, root, ret);
 	}
 
-out:
+	btrfs_release_path(path);
+	return ret;
+}
+
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct inode *inode, u64 start,
+		       u64 end, u64 *hint_byte, int drop_cache)
+{
+	struct btrfs_path *path;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	ret = __btrfs_drop_extents(trans, root, inode, path, start, end,
+				   hint_byte, drop_cache);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -892,8 +910,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	int ret;
 	u64 ino = btrfs_ino(inode);
 
-	btrfs_drop_extent_cache(inode, start, end - 1, 0);
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1556,6 +1572,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	    BTRFS_I(inode)->last_trans <=
 	    root->fs_info->last_trans_committed) {
 		BTRFS_I(inode)->last_trans = 0;
+
+		/*
+		 * We'v had everything committed since the last time we were
+		 * modified so clear this flag in case it was set for whatever
+		 * reason, it's no longer relevant.
+		 */
+		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			  &BTRFS_I(inode)->runtime_flags);
 		mutex_unlock(&inode->i_mutex);
 		goto out;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6971bac..1b99fe8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -247,7 +247,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 		return 1;
 	}
 
-	ret = btrfs_drop_extents(trans, inode, start, aligned_end,
+	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end,
 				 &hint_byte, 1);
 	if (ret)
 		return ret;
@@ -1803,7 +1803,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	 * the caller is expected to unpin it and allow it to be merged
 	 * with the others.
 	 */
-	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
+	ret = btrfs_drop_extents(trans, root, inode, file_pos,
+				 file_pos + num_bytes,
 				 &hint, 0);
 	if (ret)
 		goto out;
@@ -1929,11 +1930,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 						ordered_extent->len,
 						compress_type, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
-		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-				   ordered_extent->file_offset,
-				   ordered_extent->len);
 	}
-
+	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+			   ordered_extent->file_offset, ordered_extent->len,
+			   trans->transid);
 	if (ret < 0) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out_unlock;
@@ -2592,6 +2592,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
 
 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+	/*
+	 * If we were modified in the current generation and evicted from memory
+	 * and then re-read we need to do a full sync since we don't have any
+	 * idea about which extents were modified before we were evicted from
+	 * cache.
+	 */
+	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+
 	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
 	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
@@ -3269,8 +3281,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	path->reada = -1;
 
+	/*
+	 * We want to drop from the next block forward in case this new size is
+	 * not block aligned since we will be keeping the last block of the
+	 * extent just the way it is.
+	 */
 	if (root->ref_cows || root == root->fs_info->tree_root)
-		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+		btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
 
 	/*
 	 * This function is also used to drop the items in the log tree before
@@ -3579,6 +3596,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 mask = root->sectorsize - 1;
 	u64 hole_start = (oldsize + mask) & ~mask;
 	u64 block_end = (size + mask) & ~mask;
@@ -3615,6 +3633,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 		last_byte = min(extent_map_end(em), block_end);
 		last_byte = (last_byte + mask) & ~mask;
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+			struct extent_map *hole_em;
 			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
 
@@ -3624,7 +3643,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 				break;
 			}
 
-			err = btrfs_drop_extents(trans, inode, cur_offset,
+			err = btrfs_drop_extents(trans, root, inode,
+						 cur_offset,
 						 cur_offset + hole_size,
 						 &hint_byte, 1);
 			if (err) {
@@ -3643,9 +3663,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 				break;
 			}
 
-			btrfs_drop_extent_cache(inode, hole_start,
-					last_byte - 1, 0);
+			btrfs_drop_extent_cache(inode, cur_offset,
+						cur_offset + hole_size - 1, 0);
+			hole_em = alloc_extent_map();
+			if (!hole_em) {
+				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+					&BTRFS_I(inode)->runtime_flags);
+				goto next;
+			}
+			hole_em->start = cur_offset;
+			hole_em->len = hole_size;
+			hole_em->orig_start = cur_offset;
+
+			hole_em->block_start = EXTENT_MAP_HOLE;
+			hole_em->block_len = 0;
+			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+			hole_em->compress_type = BTRFS_COMPRESS_NONE;
+			hole_em->generation = trans->transid;
 
+			while (1) {
+				write_lock(&em_tree->lock);
+				err = add_extent_mapping(em_tree, hole_em);
+				if (!err)
+					list_move(&hole_em->list,
+						  &em_tree->modified_extents);
+				write_unlock(&em_tree->lock);
+				if (err != -EEXIST)
+					break;
+				btrfs_drop_extent_cache(inode, cur_offset,
+							cur_offset +
+							hole_size - 1, 0);
+			}
+			free_extent_map(hole_em);
+next:
 			btrfs_update_inode(trans, root, inode);
 			btrfs_end_transaction(trans, root);
 		}
@@ -4673,6 +4723,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	BTRFS_I(inode)->generation = trans->transid;
 	inode->i_generation = BTRFS_I(inode)->generation;
 
+	/*
+	 * We could have gotten an inode number from somebody who was fsynced
+	 * and then removed in this same transaction, so let's just set full
+	 * sync since it will be a full sync anyway and this will blow away the
+	 * old info in the log.
+	 */
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
 	if (S_ISDIR(mode))
 		owner = 0;
 	else
@@ -6839,6 +6897,15 @@ static int btrfs_truncate(struct inode *inode)
 					   &BTRFS_I(inode)->runtime_flags))
 		btrfs_add_ordered_operation(trans, root, inode);
 
+	/*
+	 * So if we truncate and then write and fsync we normally would just
+	 * write the extents that changed, which is a problem if we need to
+	 * first truncate that entire inode.  So set this flag so we write out
+	 * all of the extents in the inode to the sync log so we're completely
+	 * safe.
+	 */
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
 	while (1) {
 		ret = btrfs_block_rsv_refill(root, rsv, min_size);
 		if (ret) {
@@ -7510,6 +7577,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 				       loff_t actual_len, u64 *alloc_hint,
 				       struct btrfs_trans_handle *trans)
 {
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 cur_offset = start;
@@ -7550,6 +7619,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		btrfs_drop_extent_cache(inode, cur_offset,
 					cur_offset + ins.offset -1, 0);
 
+		em = alloc_extent_map();
+		if (!em) {
+			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				&BTRFS_I(inode)->runtime_flags);
+			goto next;
+		}
+
+		em->start = cur_offset;
+		em->orig_start = cur_offset;
+		em->len = ins.offset;
+		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		em->generation = trans->transid;
+
+		while (1) {
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em);
+			if (!ret)
+				list_move(&em->list,
+					  &em_tree->modified_extents);
+			write_unlock(&em_tree->lock);
+			if (ret != -EEXIST)
+				break;
+			btrfs_drop_extent_cache(inode, cur_offset,
+						cur_offset + ins.offset - 1,
+						0);
+		}
+		free_extent_map(em);
+next:
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
 		*alloc_hint = ins.objectid + ins.offset;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9df50fa..9522322 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2576,7 +2576,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					datal -= off - key.offset;
 				}
 
-				ret = btrfs_drop_extents(trans, inode,
+				ret = btrfs_drop_extents(trans, root, inode,
 							 new_key.offset,
 							 new_key.offset + datal,
 							 &hint_byte, 1);
@@ -2650,7 +2650,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				size -= skip + trim;
 				datal -= skip + trim;
 
-				ret = btrfs_drop_extents(trans, inode,
+				ret = btrfs_drop_extents(trans, root, inode,
 							 new_key.offset,
 							 new_key.offset + datal,
 							 &hint_byte, 1);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c86670f..f2ff02c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/list_sort.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
@@ -550,7 +551,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
 	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
-	ret = btrfs_drop_extents(trans, inode, start, extent_end,
+	ret = btrfs_drop_extents(trans, root, inode, start, extent_end,
 				 &alloc_hint, 1);
 	BUG_ON(ret);
 
@@ -2803,6 +2804,194 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct extent_map *em1, *em2;
+
+	em1 = list_entry(a, struct extent_map, list);
+	em2 = list_entry(b, struct extent_map, list);
+
+	if (em1->start < em2->start)
+		return -1;
+	else if (em1->start > em2->start)
+		return 1;
+	return 0;
+}
+
+struct log_args {
+	struct extent_buffer *src;
+	u64 next_offset;
+	int start_slot;
+	int nr;
+};
+
+static int log_one_extent(struct btrfs_trans_handle *trans,
+			  struct inode *inode, struct btrfs_root *root,
+			  struct extent_map *em, struct btrfs_path *path,
+			  struct btrfs_path *dst_path, struct log_args *args)
+{
+	struct btrfs_root *log = root->log_root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 start = em->start;
+	u64 len = em->len;
+	u64 num_bytes;
+	int nritems;
+	int ret;
+
+	if (BTRFS_I(inode)->logged_trans == trans->transid) {
+		u64 tmp;
+		ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
+					   start + len, &tmp, 0);
+		if (ret)
+			return ret;
+	}
+
+	while (len) {
+		if (args->nr)
+			goto next_slot;
+		key.objectid = btrfs_ino(inode);
+		key.type = BTRFS_EXTENT_DATA_KEY;
+		key.offset = start;
+
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			return ret;
+		if (ret) {
+			/*
+			 * This shouldn't happen, but it might so warn and
+			 * return an error.
+			 */
+			WARN_ON(1);
+			return -ENOENT;
+		}
+		args->src = path->nodes[0];
+next_slot:
+		fi = btrfs_item_ptr(args->src, path->slots[0],
+				    struct btrfs_file_extent_item);
+		if (args->nr &&
+		    args->start_slot + args->nr == path->slots[0]) {
+			args->nr++;
+		} else if (args->nr) {
+			ret = copy_items(trans, log, dst_path, args->src,
+					 args->start_slot, args->nr,
+					 LOG_INODE_ALL);
+			if (ret)
+				return ret;
+			args->nr = 1;
+			args->start_slot = path->slots[0];
+		} else if (!args->nr) {
+			args->nr = 1;
+			args->start_slot = path->slots[0];
+		}
+		nritems = btrfs_header_nritems(path->nodes[0]);
+		path->slots[0]++;
+		num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
+		if (len < num_bytes) {
+			/* I _think_ this is ok, envision we write to a
+			 * preallocated space that is adjacent to a previously
+			 * written preallocated space that gets merged when we
+			 * mark this preallocated space written.  If we do not
+			 * have the adjacent extent in cache then when we copy
+			 * this extent it could end up being larger than our EM
+			 * thinks it is, which is a-ok, so just set len to 0.
+			 */
+			len = 0;
+		} else {
+			len -= num_bytes;
+		}
+		start += btrfs_file_extent_num_bytes(args->src, fi);
+		args->next_offset = start;
+
+		if (path->slots[0] < nritems) {
+			if (len)
+				goto next_slot;
+			break;
+		}
+
+		if (args->nr) {
+			ret = copy_items(trans, log, dst_path, args->src,
+					 args->start_slot, args->nr,
+					 LOG_INODE_ALL);
+			if (ret)
+				return ret;
+			args->nr = 0;
+			btrfs_release_path(path);
+		}
+	}
+
+	return 0;
+}
+
+static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct inode *inode,
+				     struct btrfs_path *path,
+				     struct btrfs_path *dst_path)
+{
+	struct log_args args;
+	struct btrfs_root *log = root->log_root;
+	struct extent_map *em, *n;
+	struct list_head extents;
+	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+	u64 test_gen;
+	int ret = 0;
+
+	INIT_LIST_HEAD(&extents);
+
+	memset(&args, 0, sizeof(args));
+
+	write_lock(&tree->lock);
+	test_gen = root->fs_info->last_trans_committed;
+
+	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+		list_del_init(&em->list);
+		if (em->generation <= test_gen)
+			continue;
+		list_add_tail(&em->list, &extents);
+	}
+
+	list_sort(NULL, &extents, extent_cmp);
+
+	while (!list_empty(&extents)) {
+		em = list_entry(extents.next, struct extent_map, list);
+
+		list_del_init(&em->list);
+
+		/*
+		 * If we had an error we just need to delete everybody from our
+		 * private list.
+		 */
+		if (ret)
+			continue;
+
+		/*
+		 * If the previous EM and the last extent we left off on aren't
+		 * sequential then we need to copy the items we have and redo
+		 * our search
+		 */
+		if (args.nr && em->start != args.next_offset) {
+			ret = copy_items(trans, log, dst_path, args.src,
+					 args.start_slot, args.nr,
+					 LOG_INODE_ALL);
+			if (ret)
+				continue;
+			btrfs_release_path(path);
+			args.nr = 0;
+		}
+
+		ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
+	}
+
+	if (!ret && args.nr)
+		ret = copy_items(trans, log, dst_path, args.src,
+				 args.start_slot, args.nr, LOG_INODE_ALL);
+	btrfs_release_path(path);
+	WARN_ON(!list_empty(&extents));
+	write_unlock(&tree->lock);
+	return ret;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -2832,6 +3021,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	int nritems;
 	int ins_start_slot = 0;
 	int ins_nr;
+	bool fast_search = false;
 	u64 ino = btrfs_ino(inode);
 
 	log = root->log_root;
@@ -2851,10 +3041,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	max_key.objectid = ino;
 
-	/* today the code can only do partial logging of directories */
-	if (!S_ISDIR(inode->i_mode))
-	    inode_only = LOG_INODE_ALL;
 
+	/* today the code can only do partial logging of directories */
 	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
 		max_key.type = BTRFS_XATTR_ITEM_KEY;
 	else
@@ -2881,7 +3069,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			max_key_type = BTRFS_XATTR_ITEM_KEY;
 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
 	} else {
-		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				       &BTRFS_I(inode)->runtime_flags)) {
+			ret = btrfs_truncate_inode_items(trans, log,
+							 inode, 0, 0);
+		} else {
+			fast_search = true;
+			max_key.type = BTRFS_XATTR_ITEM_KEY;
+			ret = drop_objectid_items(trans, log, path, ino,
+						  BTRFS_XATTR_ITEM_KEY);
+		}
 	}
 	if (ret) {
 		err = ret;
@@ -2960,7 +3157,18 @@ next_slot:
 		}
 		ins_nr = 0;
 	}
-	WARN_ON(ins_nr);
+
+	if (fast_search) {
+		btrfs_release_path(path);
+		btrfs_release_path(dst_path);
+		ret = btrfs_log_changed_extents(trans, root, inode, path,
+						dst_path);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
+	}
+
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
 		btrfs_release_path(path);
 		btrfs_release_path(dst_path);
-- 
cgit v0.10.2


From 0fa83cdb1d72a94ea84ab6380747de6ac7cc8753 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 24 Aug 2012 14:48:11 -0400
Subject: Btrfs: only warn if we hit an error when doing the tree logging

I hit this a couple times while working on my fsync patch (all my bugs, not
normal operation), but with my new stuff we could have new errors from cases
I have not encountered, so instead of BUG()'ing we should be WARN()'ing so
that we are notified there is a problem but the user doesn't lose their
data.  We can easily commit the transaction in the case that the tree
logging fails and still be fine, so let's try and be as nice to the user as
possible.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f2ff02c..94db438 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3346,7 +3346,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 end_trans:
 	dput(old_parent);
 	if (ret < 0) {
-		BUG_ON(ret != -ENOSPC);
+		WARN_ON(ret != -ENOSPC);
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 		ret = 1;
 	}
-- 
cgit v0.10.2


From 06d3d22b456c2f87aeb1eb4517eeabb47e21fcc9 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Mon, 27 Aug 2012 10:52:19 -0600
Subject: Btrfs: cleanup extents after we finish logging inode

This is based on Josef's "Btrfs: turbo charge fsync".

We should cleanup those extents after we've finished logging inode,
otherwise we may do redundant work on them.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 94db438..58075d7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3167,6 +3167,12 @@ next_slot:
 			err = ret;
 			goto out_unlock;
 		}
+	} else {
+		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+		struct extent_map *em, *n;
+
+		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
+			list_del_init(&em->list);
 	}
 
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
-- 
cgit v0.10.2


From ca7e70f59078046db28501519308c2061b0e7a6f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 27 Aug 2012 17:48:15 -0400
Subject: Btrfs: do not needlessly restart the transaction for enospc

We will stop and restart a transaction every time we move to a different leaf
when truncating a file.  This is for enospc reasons, but really we could
probably get away with doing this a little better by actually working until we
hit an ENOSPC.  So add a ->failfast flag to the block_rsv and set it when we do
truncates which will fail as soon as the block rsv runs out of space, and then
at that point we can stop and restart the transaction and refill the block rsv
and carry on.  This will make rm'ing of a file with lots of extents a bit
faster.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4b81ea3..81c772b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1034,6 +1034,7 @@ struct btrfs_block_rsv {
 	struct btrfs_space_info *space_info;
 	spinlock_t lock;
 	unsigned int full;
+	unsigned int failfast;
 };
 
 /*
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 62fc92f..df2f17b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6308,7 +6308,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	ret = block_rsv_use_bytes(block_rsv, blocksize);
 	if (!ret)
 		return block_rsv;
-	if (ret) {
+	if (ret && !block_rsv->failfast) {
 		static DEFINE_RATELIMIT_STATE(_rs,
 				DEFAULT_RATELIMIT_INTERVAL,
 				/*DEFAULT_RATELIMIT_BURST*/ 2);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1b99fe8..ca4fa05 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3448,12 +3448,6 @@ delete:
 
 		if (path->slots[0] == 0 ||
 		    path->slots[0] != pending_del_slot) {
-			if (root->ref_cows &&
-			    BTRFS_I(inode)->location.objectid !=
-						BTRFS_FREE_INO_OBJECTID) {
-				err = -EAGAIN;
-				goto out;
-			}
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						pending_del_slot,
@@ -3826,6 +3820,7 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 	rsv->size = min_size;
+	rsv->failfast = 1;
 	global_rsv = &root->fs_info->global_block_rsv;
 
 	btrfs_i_size_write(inode, 0);
@@ -3870,7 +3865,7 @@ void btrfs_evict_inode(struct inode *inode)
 		trans->block_rsv = rsv;
 
 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
-		if (ret != -EAGAIN)
+		if (ret != -ENOSPC)
 			break;
 
 		nr = trans->blocks_used;
@@ -6852,6 +6847,7 @@ static int btrfs_truncate(struct inode *inode)
 	if (!rsv)
 		return -ENOMEM;
 	rsv->size = min_size;
+	rsv->failfast = 1;
 
 	/*
 	 * 1 for the truncate slack space
@@ -6905,37 +6901,13 @@ static int btrfs_truncate(struct inode *inode)
 	 * safe.
 	 */
 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+	trans->block_rsv = rsv;
 
 	while (1) {
-		ret = btrfs_block_rsv_refill(root, rsv, min_size);
-		if (ret) {
-			/*
-			 * This can only happen with the original transaction we
-			 * started above, every other time we shouldn't have a
-			 * transaction started yet.
-			 */
-			if (ret == -EAGAIN)
-				goto end_trans;
-			err = ret;
-			break;
-		}
-
-		if (!trans) {
-			/* Just need the 1 for updating the inode */
-			trans = btrfs_start_transaction(root, 1);
-			if (IS_ERR(trans)) {
-				ret = err = PTR_ERR(trans);
-				trans = NULL;
-				break;
-			}
-		}
-
-		trans->block_rsv = rsv;
-
 		ret = btrfs_truncate_inode_items(trans, root, inode,
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
-		if (ret != -EAGAIN) {
+		if (ret != -ENOSPC) {
 			err = ret;
 			break;
 		}
@@ -6946,11 +6918,22 @@ static int btrfs_truncate(struct inode *inode)
 			err = ret;
 			break;
 		}
-end_trans:
+
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
-		trans = NULL;
 		btrfs_btree_balance_dirty(root, nr);
+
+		trans = btrfs_start_transaction(root, 2);
+		if (IS_ERR(trans)) {
+			ret = err = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
+
+		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+					      rsv, min_size);
+		BUG_ON(ret);	/* shouldn't happen */
+		trans->block_rsv = rsv;
 	}
 
 	if (ret == 0 && inode->i_nlink > 0) {
-- 
cgit v0.10.2


From 4e2f84e63dc138eca91e89ccbc34f37732ce58f7 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Mon, 27 Aug 2012 10:52:20 -0600
Subject: Btrfs: improve fsync by filtering extents that we want

This is based on Josef's "Btrfs: turbo charge fsync".

The above Josef's patch performs very good in random sync write test,
because we won't have too much extents to merge.

However, it does not performs good on the test:
dd if=/dev/zero of=foobar bs=4k count=12500 oflag=sync

The reason is when we do sequencial sync write, we need to merge the
current extent just with the previous one, so that we can get accumulated
extents to log:

A(4k) --> AA(8k) --> AAA(12k) --> AAAA(16k) ...

So we'll have to flush more and more checksum into log tree, which is the
bottleneck according to my tests.

But we can avoid this by telling fsync the real extents that are needed
to be logged.

With this, I did the above dd sync write test (size=50m),

         w/o (orig)   w/ (josef's)   w/ (this)
SATA      104KB/s       109KB/s       121KB/s
ramdisk   1.5MB/s       1.5MB/s       10.7MB/s (613%)

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1fe82cf..ac606f0 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -203,6 +203,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
 			if (merge->generation > em->generation) {
+				em->mod_start = em->start;
+				em->mod_len = em->len;
 				em->generation = merge->generation;
 				list_move(&em->list, &tree->modified_extents);
 			}
@@ -222,6 +224,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
 		if (merge->generation > em->generation) {
+			em->mod_len = em->len;
 			em->generation = merge->generation;
 			list_move(&em->list, &tree->modified_extents);
 		}
@@ -247,6 +250,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
 {
 	int ret = 0;
 	struct extent_map *em;
+	bool prealloc = false;
 
 	write_lock(&tree->lock);
 	em = lookup_extent_mapping(tree, start, len);
@@ -259,8 +263,21 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
 	list_move(&em->list, &tree->modified_extents);
 	em->generation = gen;
 	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		prealloc = true;
+		clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+	}
 
 	try_merge_map(tree, em);
+
+	if (prealloc) {
+		em->mod_start = em->start;
+		em->mod_len = em->len;
+	}
+
 	free_extent_map(em);
 out:
 	write_unlock(&tree->lock);
@@ -298,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	}
 	atomic_inc(&em->refs);
 
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
 	try_merge_map(tree, em);
 out:
 	return ret;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 2388a60..8e6294b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -20,6 +20,8 @@ struct extent_map {
 	/* all of these are in bytes */
 	u64 start;
 	u64 len;
+	u64 mod_start;
+	u64 mod_len;
 	u64 orig_start;
 	u64 block_start;
 	u64 block_len;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ca4fa05..878116d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1308,6 +1308,7 @@ out_check:
 			em->block_start = disk_bytenr;
 			em->bdev = root->fs_info->fs_devices->latest_bdev;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
+			set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 			while (1) {
 				write_lock(&em_tree->lock);
 				ret = add_extent_mapping(em_tree, em);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 58075d7..71e7153 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2833,8 +2833,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_file_extent_item *fi;
 	struct btrfs_key key;
-	u64 start = em->start;
-	u64 len = em->len;
+	u64 start = em->mod_start;
+	u64 len = em->mod_len;
 	u64 num_bytes;
 	int nritems;
 	int ret;
@@ -2970,7 +2970,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		 * sequential then we need to copy the items we have and redo
 		 * our search
 		 */
-		if (args.nr && em->start != args.next_offset) {
+		if (args.nr && em->mod_start != args.next_offset) {
 			ret = copy_items(trans, log, dst_path, args.src,
 					 args.start_slot, args.nr,
 					 LOG_INODE_ALL);
-- 
cgit v0.10.2


From 321f0e70225abc792d74902a2bc4a60164265fd4 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Tue, 28 Aug 2012 22:13:02 -0600
Subject: Btrfs: fix wrong orphan count of the fs/file tree

If we add a new orphan item, we should increase the atomic counter,
not decrease it. Fix it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 878116d..a6824bd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2228,7 +2228,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 			insert = 1;
 #endif
 		insert = 1;
-		atomic_dec(&root->orphan_inodes);
+		atomic_inc(&root->orphan_inodes);
 	}
 
 	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-- 
cgit v0.10.2


From 46d8bc34248f3a94dea910137d1ddf5fb1e3a1cc Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 29 Aug 2012 01:07:55 -0600
Subject: Btrfs: fix a bug in checking whether a inode is already in log

This is based on Josef's "Btrfs: turbo charge fsync".

The current btrfs checks if an inode is in log by comparing
root's last_log_commit to inode's last_sub_trans[2].

But the problem is that this root->last_log_commit is shared among
inodes.

Say we have N inodes to be logged, after the first inode,
root's last_log_commit is updated and the N-1 remained files will
be skipped.

This fixes the bug by keeping a local copy of root's last_log_commit
inside each inode and this local copy will be maintained itself.

[1]: we regard each log transaction as a subset of btrfs's transaction,
i.e. sub_trans

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7c7bf81..ed8ca7c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -144,6 +144,9 @@ struct btrfs_inode {
 	/* flags field from the on disk inode */
 	u32 flags;
 
+	/* a local copy of root's last_log_commit */
+	unsigned long last_log_commit;
+
 	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such.  outstanding_extents is the number of extent
@@ -203,15 +206,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
 
 static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret = 0;
-
-	mutex_lock(&root->log_mutex);
 	if (BTRFS_I(inode)->logged_trans == generation &&
-	    BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
-		ret = 1;
-	mutex_unlock(&root->log_mutex);
-	return ret;
+	    BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
+		return 1;
+	return 0;
 }
 
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a6824bd..24745b8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6774,6 +6774,7 @@ again:
 
 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
@@ -7018,6 +7019,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->csum_bytes = 0;
 	ei->index_cnt = (u64)-1;
 	ei->last_unlink_trans = 0;
+	ei->last_log_commit = 0;
 
 	spin_lock_init(&ei->lock);
 	ei->outstanding_extents = 0;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e8b8416..1a138bf 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -88,6 +88,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 {
 	BTRFS_I(inode)->last_trans = trans->transaction->transid;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 }
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 71e7153..fc0df95 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3185,6 +3185,7 @@ next_slot:
 		}
 	}
 	BTRFS_I(inode)->logged_trans = trans->transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
-- 
cgit v0.10.2


From d2794405113cd04f13a58e32e4d541dc87ec86e4 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 29 Aug 2012 01:07:56 -0600
Subject: Btrfs: check if an inode has no checksum when logging it

This is based on Josef's "Btrfs: turbo charge fsync".

If an inode is a BTRFS_INODE_NODATASUM one, we don't need to look for csum
items any more.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fc0df95..5e2ffb9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2680,13 +2680,14 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 }
 
 static noinline int copy_items(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *log,
+			       struct inode *inode,
 			       struct btrfs_path *dst_path,
 			       struct extent_buffer *src,
 			       int start_slot, int nr, int inode_only)
 {
 	unsigned long src_offset;
 	unsigned long dst_offset;
+	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
 	struct btrfs_file_extent_item *extent;
 	struct btrfs_inode_item *inode_item;
 	int ret;
@@ -2695,6 +2696,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	char *ins_data;
 	int i;
 	struct list_head ordered_sums;
+	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	INIT_LIST_HEAD(&ordered_sums);
 
@@ -2745,7 +2747,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 		 * or deletes of this inode don't have to relog the inode
 		 * again
 		 */
-		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
+		    !skip_csum) {
 			int found_type;
 			extent = btrfs_item_ptr(src, start_slot + i,
 						struct btrfs_file_extent_item);
@@ -2873,7 +2876,7 @@ next_slot:
 		    args->start_slot + args->nr == path->slots[0]) {
 			args->nr++;
 		} else if (args->nr) {
-			ret = copy_items(trans, log, dst_path, args->src,
+			ret = copy_items(trans, inode, dst_path, args->src,
 					 args->start_slot, args->nr,
 					 LOG_INODE_ALL);
 			if (ret)
@@ -2910,7 +2913,7 @@ next_slot:
 		}
 
 		if (args->nr) {
-			ret = copy_items(trans, log, dst_path, args->src,
+			ret = copy_items(trans, inode, dst_path, args->src,
 					 args->start_slot, args->nr,
 					 LOG_INODE_ALL);
 			if (ret)
@@ -2930,7 +2933,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 				     struct btrfs_path *dst_path)
 {
 	struct log_args args;
-	struct btrfs_root *log = root->log_root;
 	struct extent_map *em, *n;
 	struct list_head extents;
 	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
@@ -2971,7 +2973,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		 * our search
 		 */
 		if (args.nr && em->mod_start != args.next_offset) {
-			ret = copy_items(trans, log, dst_path, args.src,
+			ret = copy_items(trans, inode, dst_path, args.src,
 					 args.start_slot, args.nr,
 					 LOG_INODE_ALL);
 			if (ret)
@@ -2984,7 +2986,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	}
 
 	if (!ret && args.nr)
-		ret = copy_items(trans, log, dst_path, args.src,
+		ret = copy_items(trans, inode, dst_path, args.src,
 				 args.start_slot, args.nr, LOG_INODE_ALL);
 	btrfs_release_path(path);
 	WARN_ON(!list_empty(&extents));
@@ -3109,7 +3111,7 @@ again:
 			goto next_slot;
 		}
 
-		ret = copy_items(trans, log, dst_path, src, ins_start_slot,
+		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
 				 ins_nr, inode_only);
 		if (ret) {
 			err = ret;
@@ -3127,7 +3129,7 @@ next_slot:
 			goto again;
 		}
 		if (ins_nr) {
-			ret = copy_items(trans, log, dst_path, src,
+			ret = copy_items(trans, inode, dst_path, src,
 					 ins_start_slot,
 					 ins_nr, inode_only);
 			if (ret) {
@@ -3148,8 +3150,7 @@ next_slot:
 			break;
 	}
 	if (ins_nr) {
-		ret = copy_items(trans, log, dst_path, src,
-				 ins_start_slot,
+		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
 				 ins_nr, inode_only);
 		if (ret) {
 			err = ret;
-- 
cgit v0.10.2


From 2671485d395c07fca104c972785898d7c52fc942 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 29 Aug 2012 12:24:27 -0400
Subject: Btrfs: remove unused hint byte argument for btrfs_drop_extents

I audited all users of btrfs_drop_extents and found that nobody actually uses
the hint_byte argument.  I'm sure it was used for something at some point but
it's not used now, and the way the pinning works the disk bytenr would never be
immediately useful anyway so lets just remove it.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 81c772b..71d6ff1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3323,10 +3323,10 @@ extern const struct file_operations btrfs_file_operations;
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root, struct inode *inode,
 			 struct btrfs_path *path, u64 start, u64 end,
-			 u64 *hint_byte, int drop_cache);
+			 int drop_cache);
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode, u64 start,
-		       u64 end, u64 *hint_byte, int drop_cache);
+		       u64 end, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 399f9d7..58598c2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -584,7 +584,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root, struct inode *inode,
 			 struct btrfs_path *path, u64 start, u64 end,
-			 u64 *hint_byte, int drop_cache)
+			 int drop_cache)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
@@ -716,7 +716,6 @@ next_slot:
 						new_key.objectid,
 						start - extent_offset, 0);
 				BUG_ON(ret); /* -ENOMEM */
-				*hint_byte = disk_bytenr;
 			}
 			key.offset = start;
 		}
@@ -736,10 +735,8 @@ next_slot:
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_end - end);
 			btrfs_mark_buffer_dirty(leaf);
-			if (update_refs && disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0)
 				inode_sub_bytes(inode, end - key.offset);
-				*hint_byte = disk_bytenr;
-			}
 			break;
 		}
 
@@ -755,10 +752,8 @@ next_slot:
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							start - key.offset);
 			btrfs_mark_buffer_dirty(leaf);
-			if (update_refs && disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0)
 				inode_sub_bytes(inode, extent_end - start);
-				*hint_byte = disk_bytenr;
-			}
 			if (end == extent_end)
 				break;
 
@@ -794,7 +789,6 @@ next_slot:
 				BUG_ON(ret); /* -ENOMEM */
 				inode_sub_bytes(inode,
 						extent_end - key.offset);
-				*hint_byte = disk_bytenr;
 			}
 
 			if (end == extent_end)
@@ -834,7 +828,7 @@ next_slot:
 
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode, u64 start,
-		       u64 end, u64 *hint_byte, int drop_cache)
+		       u64 end, int drop_cache)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -843,7 +837,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 	ret = __btrfs_drop_extents(trans, root, inode, path, start, end,
-				   hint_byte, drop_cache);
+				   drop_cache);
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 24745b8..f0a4792 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -230,7 +230,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	u64 inline_len = actual_end - start;
 	u64 aligned_end = (end + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
-	u64 hint_byte;
 	u64 data_len = inline_len;
 	int ret;
 
@@ -247,8 +246,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 		return 1;
 	}
 
-	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end,
-				 &hint_byte, 1);
+	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
 	if (ret)
 		return ret;
 
@@ -1786,7 +1784,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key ins;
-	u64 hint;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -1805,8 +1802,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	 * with the others.
 	 */
 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
-				 file_pos + num_bytes,
-				 &hint, 0);
+				 file_pos + num_bytes, 0);
 	if (ret)
 		goto out;
 
@@ -3629,7 +3625,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 		last_byte = (last_byte + mask) & ~mask;
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
 			struct extent_map *hole_em;
-			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
 
 			trans = btrfs_start_transaction(root, 3);
@@ -3640,8 +3635,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
 			err = btrfs_drop_extents(trans, root, inode,
 						 cur_offset,
-						 cur_offset + hole_size,
-						 &hint_byte, 1);
+						 cur_offset + hole_size, 1);
 			if (err) {
 				btrfs_abort_transaction(trans, root, err);
 				btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9522322..5543fd5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2353,7 +2353,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	int ret;
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
-	u64 hint_byte;
 
 	/*
 	 * TODO:
@@ -2579,7 +2578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				ret = btrfs_drop_extents(trans, root, inode,
 							 new_key.offset,
 							 new_key.offset + datal,
-							 &hint_byte, 1);
+							 1);
 				if (ret) {
 					btrfs_abort_transaction(trans, root,
 								ret);
@@ -2653,7 +2652,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				ret = btrfs_drop_extents(trans, root, inode,
 							 new_key.offset,
 							 new_key.offset + datal,
-							 &hint_byte, 1);
+							 1);
 				if (ret) {
 					btrfs_abort_transaction(trans, root,
 								ret);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5e2ffb9..0c39f58 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -485,7 +485,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	int found_type;
 	u64 mask = root->sectorsize - 1;
 	u64 extent_end;
-	u64 alloc_hint;
 	u64 start = key->offset;
 	u64 saved_nbytes;
 	struct btrfs_file_extent_item *item;
@@ -551,8 +550,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
 	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
-	ret = btrfs_drop_extents(trans, root, inode, start, extent_end,
-				 &alloc_hint, 1);
+	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
 	BUG_ON(ret);
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -2843,9 +2841,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	int ret;
 
 	if (BTRFS_I(inode)->logged_trans == trans->transid) {
-		u64 tmp;
 		ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
-					   start + len, &tmp, 0);
+					   start + len, 0);
 		if (ret)
 			return ret;
 	}
-- 
cgit v0.10.2


From 2aaa66558172b017f36bf38ae69372813dedee9d Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 29 Aug 2012 14:27:18 -0400
Subject: Btrfs: add hole punching

This patch adds hole punching via fallocate.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 71d6ff1..88adfe6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3250,6 +3250,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct inode *dir, u64 objectid,
 			const char *name, int name_len);
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+			int front);
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct inode *inode, u64 new_size,
@@ -3323,7 +3325,7 @@ extern const struct file_operations btrfs_file_operations;
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root, struct inode *inode,
 			 struct btrfs_path *path, u64 start, u64 end,
-			 int drop_cache);
+			 u64 *drop_end, int drop_cache);
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode, u64 start,
 		       u64 end, int drop_cache);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df2f17b..ee51b7a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4132,6 +4132,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv)
 {
+	if (!rsv)
+		return;
 	btrfs_block_rsv_release(root, rsv, (u64)-1);
 	kfree(rsv);
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 58598c2..57026a6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,6 +39,7 @@
 #include "tree-log.h"
 #include "locking.h"
 #include "compat.h"
+#include "volumes.h"
 
 /*
  * when auto defrag is enabled we
@@ -584,7 +585,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root, struct inode *inode,
 			 struct btrfs_path *path, u64 start, u64 end,
-			 int drop_cache)
+			 u64 *drop_end, int drop_cache)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
@@ -822,6 +823,8 @@ next_slot:
 			btrfs_abort_transaction(trans, root, ret);
 	}
 
+	if (drop_end)
+		*drop_end = min(end, extent_end);
 	btrfs_release_path(path);
 	return ret;
 }
@@ -836,7 +839,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	ret = __btrfs_drop_extents(trans, root, inode, path, start, end,
+	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
 				   drop_cache);
 	btrfs_free_path(path);
 	return ret;
@@ -1645,6 +1648,324 @@ static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+			  int slot, u64 start, u64 end)
+{
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+
+	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+		return 0;
+
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != btrfs_ino(inode) ||
+	    key.type != BTRFS_EXTENT_DATA_KEY)
+		return 0;
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+		return 0;
+
+	if (btrfs_file_extent_disk_bytenr(leaf, fi))
+		return 0;
+
+	if (key.offset == end)
+		return 1;
+	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+		return 1;
+	return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+		      struct btrfs_path *path, u64 offset, u64 end)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *fi;
+	struct extent_map *hole_em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = offset;
+
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret < 0)
+		return ret;
+	BUG_ON(!ret);
+
+	leaf = path->nodes[0];
+	if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+		u64 num_bytes;
+
+		path->slots[0]--;
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+			end - offset;
+		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_offset(leaf, fi, 0);
+		btrfs_mark_buffer_dirty(leaf);
+		goto out;
+	}
+
+	if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+		u64 num_bytes;
+
+		path->slots[0]++;
+		key.offset = offset;
+		btrfs_set_item_key_safe(trans, root, path, &key);
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+			offset;
+		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_offset(leaf, fi, 0);
+		btrfs_mark_buffer_dirty(leaf);
+		goto out;
+	}
+	btrfs_release_path(path);
+
+	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+				       0, 0, end - offset, 0, end - offset,
+				       0, 0, 0);
+	if (ret)
+		return ret;
+
+out:
+	btrfs_release_path(path);
+
+	hole_em = alloc_extent_map();
+	if (!hole_em) {
+		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+	} else {
+		hole_em->start = offset;
+		hole_em->len = end - offset;
+		hole_em->orig_start = offset;
+
+		hole_em->block_start = EXTENT_MAP_HOLE;
+		hole_em->block_len = 0;
+		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+		hole_em->compress_type = BTRFS_COMPRESS_NONE;
+		hole_em->generation = trans->transid;
+
+		do {
+			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, hole_em);
+			if (!ret)
+				list_move(&hole_em->list,
+					  &em_tree->modified_extents);
+			write_unlock(&em_tree->lock);
+		} while (ret == -EEXIST);
+		free_extent_map(hole_em);
+		if (ret)
+			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				&BTRFS_I(inode)->runtime_flags);
+	}
+
+	return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_state *cached_state = NULL;
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *rsv;
+	struct btrfs_trans_handle *trans;
+	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+	u64 lockstart = (offset + mask) & ~mask;
+	u64 lockend = ((offset + len) & ~mask) - 1;
+	u64 cur_offset = lockstart;
+	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+	u64 drop_end;
+	unsigned long nr;
+	int ret = 0;
+	int err = 0;
+	bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
+		((offset + len) >> PAGE_CACHE_SHIFT);
+
+	btrfs_wait_ordered_range(inode, offset, len);
+
+	mutex_lock(&inode->i_mutex);
+	if (offset >= inode->i_size) {
+		mutex_unlock(&inode->i_mutex);
+		return 0;
+	}
+
+	/*
+	 * Only do this if we are in the same page and we aren't doing the
+	 * entire page.
+	 */
+	if (same_page && len < PAGE_CACHE_SIZE) {
+		ret = btrfs_truncate_page(inode, offset, len, 0);
+		mutex_unlock(&inode->i_mutex);
+		return ret;
+	}
+
+	/* zero back part of the first page */
+	ret = btrfs_truncate_page(inode, offset, 0, 0);
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		return ret;
+	}
+
+	/* zero the front end of the last page */
+	ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		return ret;
+	}
+
+	if (lockend < lockstart) {
+		mutex_unlock(&inode->i_mutex);
+		return 0;
+	}
+
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+
+		truncate_pagecache_range(inode, lockstart, lockend);
+
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, &cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+		/*
+		 * We need to make sure we have no ordered extents in this range
+		 * and nobody raced in and read a page in this range, if we did
+		 * we need to try again.
+		 */
+		if ((!ordered ||
+		    (ordered->file_offset + ordered->len < lockstart ||
+		     ordered->file_offset > lockend)) &&
+		     !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, EXTENT_UPTODATE, 0,
+				     cached_state)) {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state, GFP_NOFS);
+		btrfs_wait_ordered_range(inode, lockstart,
+					 lockend - lockstart + 1);
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rsv = btrfs_alloc_block_rsv(root);
+	if (!rsv) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+	rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+	rsv->failfast = 1;
+
+	/*
+	 * 1 - update the inode
+	 * 1 - removing the extents in the range
+	 * 1 - adding the hole extent
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_free;
+	}
+
+	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+				      min_size);
+	BUG_ON(ret);
+	trans->block_rsv = rsv;
+
+	while (cur_offset < lockend) {
+		ret = __btrfs_drop_extents(trans, root, inode, path,
+					   cur_offset, lockend + 1,
+					   &drop_end, 1);
+		if (ret != -ENOSPC)
+			break;
+
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+		ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+		if (ret) {
+			err = ret;
+			break;
+		}
+
+		cur_offset = drop_end;
+
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret) {
+			err = ret;
+			break;
+		}
+
+		nr = trans->blocks_used;
+		btrfs_end_transaction(trans, root);
+		btrfs_btree_balance_dirty(root, nr);
+
+		trans = btrfs_start_transaction(root, 3);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
+
+		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+					      rsv, min_size);
+		BUG_ON(ret);	/* shouldn't happen */
+		trans->block_rsv = rsv;
+	}
+
+	if (ret) {
+		err = ret;
+		goto out_trans;
+	}
+
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+	ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+	if (ret) {
+		err = ret;
+		goto out_trans;
+	}
+
+out_trans:
+	if (!trans)
+		goto out_free;
+
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+	ret = btrfs_update_inode(trans, root, inode);
+	nr = trans->blocks_used;
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root, nr);
+out_free:
+	btrfs_free_path(path);
+	btrfs_free_block_rsv(root, rsv);
+out:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			     &cached_state, GFP_NOFS);
+	mutex_unlock(&inode->i_mutex);
+	if (ret && !err)
+		err = ret;
+	return err;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
@@ -1663,10 +1984,13 @@ static long btrfs_fallocate(struct file *file, int mode,
 	alloc_start = offset & ~mask;
 	alloc_end =  (offset + len + mask) & ~mask;
 
-	/* We only support the FALLOC_FL_KEEP_SIZE mode */
-	if (mode & ~FALLOC_FL_KEEP_SIZE)
+	/* Make sure we aren't being give some crap mode */
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
 
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		return btrfs_punch_hole(inode, offset, len);
+
 	/*
 	 * Make sure we have enough space before we do the
 	 * allocation.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f0a4792..8cb272c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3475,12 +3475,20 @@ error:
 }
 
 /*
- * taken from block_truncate_page, but does cow as it zeros out
- * any bytes left in the last page in the file.
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range respective to the
+ *	offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset and cow the page and zero the
+ * part we want to zero.  This is used with truncate and hole punching.
  */
-static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+			int front)
 {
-	struct inode *inode = mapping->host;
+	struct address_space *mapping = inode->i_mapping;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
@@ -3495,7 +3503,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	u64 page_start;
 	u64 page_end;
 
-	if ((offset & (blocksize - 1)) == 0)
+	if ((offset & (blocksize - 1)) == 0 &&
+	    (!len || ((len & (blocksize - 1)) == 0)))
 		goto out;
 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret)
@@ -3555,8 +3564,13 @@ again:
 
 	ret = 0;
 	if (offset != PAGE_CACHE_SIZE) {
+		if (!len)
+			len = PAGE_CACHE_SIZE - offset;
 		kaddr = kmap(page);
-		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+		if (front)
+			memset(kaddr, 0, offset);
+		else
+			memset(kaddr + offset, 0, len);
 		flush_dcache_page(page);
 		kunmap(page);
 	}
@@ -6796,7 +6810,7 @@ static int btrfs_truncate(struct inode *inode)
 	u64 mask = root->sectorsize - 1;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
-	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+	ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0c39f58..f9b0fc9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2842,7 +2842,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 
 	if (BTRFS_I(inode)->logged_trans == trans->transid) {
 		ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
-					   start + len, 0);
+					   start + len, NULL, 0);
 		if (ret)
 			return ret;
 	}
-- 
cgit v0.10.2


From 6fc4e3548598d10a5e947797a09cbc1b257a22ab Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 30 Aug 2012 16:26:15 -0600
Subject: Btrfs: pass lockdep rwsem metadata to async commit transaction

The freeze rwsem is taken by sb_start_intwrite() and dropped during the
commit_ or end_transaction().  In the async case, that happens in a worker
thread.  Tell lockdep the calling thread is releasing ownership of the
rwsem and the async thread is picking it up.

XFS plays the same trick in fs/xfs/xfs_aops.c.

Signed-off-by: Sage Weil <sage@inktank.com>

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 27c2600..730c945 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1229,6 +1229,14 @@ static void do_async_commit(struct work_struct *work)
 	struct btrfs_async_commit *ac =
 		container_of(work, struct btrfs_async_commit, work.work);
 
+	/*
+	 * We've got freeze protection passed with the transaction.
+	 * Tell lockdep about it.
+	 */
+	rwsem_acquire_read(
+		&ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+		0, 1, _THIS_IP_);
+
 	btrfs_commit_transaction(ac->newtrans, ac->root);
 	kfree(ac);
 }
@@ -1258,6 +1266,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	atomic_inc(&cur_trans->use_count);
 
 	btrfs_end_transaction(trans, root);
+
+	/*
+	 * Tell lockdep we've released the freeze rwsem, since the
+	 * async commit thread will be the one to unlock it.
+	 */
+	rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+		      1, _THIS_IP_);
+
 	schedule_delayed_work(&ac->work, 0);
 
 	/* wait for transaction to start and unblock */
-- 
cgit v0.10.2


From e209db7ace281ca347b1ac699bf1fb222eac03fe Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 30 Aug 2012 16:26:16 -0600
Subject: Btrfs: set journal_info in async trans commit worker

We expect current->journal_info to point to the trans handle we are
committing.

Signed-off-by: Sage Weil <sage@inktank.com>

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 730c945..ffc6b520 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1237,6 +1237,8 @@ static void do_async_commit(struct work_struct *work)
 		&ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
 		0, 1, _THIS_IP_);
 
+	current->journal_info = ac->newtrans;
+
 	btrfs_commit_transaction(ac->newtrans, ac->root);
 	kfree(ac);
 }
-- 
cgit v0.10.2


From ac14aed66558d686b1f95dac1f07ecfe11d8c30e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 30 Aug 2012 16:26:17 -0600
Subject: Btrfs: do not take cleanup_work_sem in btrfs_run_delayed_iputs()

Josef has suggested that this is not necessary.  Removing it also avoids
this lockdep splat (after the new sb_internal locking stuff was added):

[  604.090449] ======================================================
[  604.114819] [ INFO: possible circular locking dependency detected ]
[  604.139262] 3.6.0-rc2-ceph-00144-g463b030 #1 Not tainted
[  604.162193] -------------------------------------------------------
[  604.186139] btrfs-cleaner/6669 is trying to acquire lock:
[  604.209555]  (sb_internal#2){.+.+..}, at: [<ffffffffa0042b84>] start_transaction+0x124/0x430 [btrfs]
[  604.257100]
[  604.257100] but task is already holding lock:
[  604.300366]  (&fs_info->cleanup_work_sem){.+.+..}, at: [<ffffffffa0048002>] btrfs_run_delayed_iputs+0x72/0x130 [btrfs]
[  604.352989]
[  604.352989] which lock already depends on the new lock.
[  604.352989]
[  604.427104]
[  604.427104] the existing dependency chain (in reverse order) is:
[  604.478493]
[  604.478493] -> #1 (&fs_info->cleanup_work_sem){.+.+..}:
[  604.529313]        [<ffffffff810b2c82>] lock_acquire+0xa2/0x140
[  604.559621]        [<ffffffff81632b69>] down_read+0x39/0x4e
[  604.589382]        [<ffffffffa004db98>] btrfs_lookup_dentry+0x218/0x550 [btrfs]
[  604.596161] btrfs: unlinked 1 orphans
[  604.675002]        [<ffffffffa006aadd>] create_subvol+0x62d/0x690 [btrfs]
[  604.708859]        [<ffffffffa006d666>] btrfs_mksubvol.isra.52+0x346/0x3a0 [btrfs]
[  604.772466]        [<ffffffffa006d7f2>] btrfs_ioctl_snap_create_transid+0x132/0x190 [btrfs]
[  604.842245]        [<ffffffffa006d8ae>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs]
[  604.912852]        [<ffffffffa00708ae>] btrfs_ioctl+0x138e/0x1990 [btrfs]
[  604.951888]        [<ffffffff8118e9b8>] do_vfs_ioctl+0x98/0x560
[  604.989961]        [<ffffffff8118ef11>] sys_ioctl+0x91/0xa0
[  605.026628]        [<ffffffff8163d569>] system_call_fastpath+0x16/0x1b
[  605.064404]
[  605.064404] -> #0 (sb_internal#2){.+.+..}:
[  605.126832]        [<ffffffff810b25e8>] __lock_acquire+0x1ac8/0x1b90
[  605.163671]        [<ffffffff810b2c82>] lock_acquire+0xa2/0x140
[  605.200228]        [<ffffffff8117dac6>] __sb_start_write+0xc6/0x1b0
[  605.236818]        [<ffffffffa0042b84>] start_transaction+0x124/0x430 [btrfs]
[  605.274029]        [<ffffffffa00431a3>] btrfs_start_transaction+0x13/0x20 [btrfs]
[  605.340520]        [<ffffffffa004ccfa>] btrfs_evict_inode+0x19a/0x330 [btrfs]
[  605.378720]        [<ffffffff811972c8>] evict+0xb8/0x1c0
[  605.416057]        [<ffffffff811974d5>] iput+0x105/0x210
[  605.452373]        [<ffffffffa0048082>] btrfs_run_delayed_iputs+0xf2/0x130 [btrfs]
[  605.521627]        [<ffffffffa003b5e1>] cleaner_kthread+0xa1/0x120 [btrfs]
[  605.560520]        [<ffffffff810791ee>] kthread+0xae/0xc0
[  605.598094]        [<ffffffff8163e744>] kernel_thread_helper+0x4/0x10
[  605.636499]
[  605.636499] other info that might help us debug this:
[  605.636499]
[  605.736504]  Possible unsafe locking scenario:
[  605.736504]
[  605.801931]        CPU0                    CPU1
[  605.835126]        ----                    ----
[  605.867093]   lock(&fs_info->cleanup_work_sem);
[  605.898594]                                lock(sb_internal#2);
[  605.931954]                                lock(&fs_info->cleanup_work_sem);
[  605.965359]   lock(sb_internal#2);
[  605.994758]
[  605.994758]  *** DEADLOCK ***
[  605.994758]
[  606.075281] 2 locks held by btrfs-cleaner/6669:
[  606.104528]  #0:  (&fs_info->cleaner_mutex){+.+...}, at: [<ffffffffa003b5d5>] cleaner_kthread+0x95/0x120 [btrfs]
[  606.165626]  #1:  (&fs_info->cleanup_work_sem){.+.+..}, at: [<ffffffffa0048002>] btrfs_run_delayed_iputs+0x72/0x130 [btrfs]
[  606.231297]
[  606.231297] stack backtrace:
[  606.287723] Pid: 6669, comm: btrfs-cleaner Not tainted 3.6.0-rc2-ceph-00144-g463b030 #1
[  606.347823] Call Trace:
[  606.376184]  [<ffffffff8162a77c>] print_circular_bug+0x1fb/0x20c
[  606.409243]  [<ffffffff810b25e8>] __lock_acquire+0x1ac8/0x1b90
[  606.441343]  [<ffffffffa0042b84>] ? start_transaction+0x124/0x430 [btrfs]
[  606.474583]  [<ffffffff810b2c82>] lock_acquire+0xa2/0x140
[  606.505934]  [<ffffffffa0042b84>] ? start_transaction+0x124/0x430 [btrfs]
[  606.539429]  [<ffffffff8132babd>] ? do_raw_spin_unlock+0x5d/0xb0
[  606.571719]  [<ffffffff8117dac6>] __sb_start_write+0xc6/0x1b0
[  606.603498]  [<ffffffffa0042b84>] ? start_transaction+0x124/0x430 [btrfs]
[  606.637405]  [<ffffffffa0042b84>] ? start_transaction+0x124/0x430 [btrfs]
[  606.670165]  [<ffffffff81172e75>] ? kmem_cache_alloc+0xb5/0x160
[  606.702144]  [<ffffffffa0042b84>] start_transaction+0x124/0x430 [btrfs]
[  606.735562]  [<ffffffffa00256a6>] ? block_rsv_add_bytes+0x56/0x80 [btrfs]
[  606.769861]  [<ffffffffa00431a3>] btrfs_start_transaction+0x13/0x20 [btrfs]
[  606.804575]  [<ffffffffa004ccfa>] btrfs_evict_inode+0x19a/0x330 [btrfs]
[  606.838756]  [<ffffffff81634c6b>] ? _raw_spin_unlock+0x2b/0x40
[  606.872010]  [<ffffffff811972c8>] evict+0xb8/0x1c0
[  606.903800]  [<ffffffff811974d5>] iput+0x105/0x210
[  606.935416]  [<ffffffffa0048082>] btrfs_run_delayed_iputs+0xf2/0x130 [btrfs]
[  606.970510]  [<ffffffffa003b5d5>] ? cleaner_kthread+0x95/0x120 [btrfs]
[  607.005648]  [<ffffffffa003b5e1>] cleaner_kthread+0xa1/0x120 [btrfs]
[  607.040724]  [<ffffffffa003b540>] ? btrfs_destroy_delayed_refs.isra.102+0x220/0x220 [btrfs]
[  607.104740]  [<ffffffff810791ee>] kthread+0xae/0xc0
[  607.137119]  [<ffffffff810b379d>] ? trace_hardirqs_on+0xd/0x10
[  607.169797]  [<ffffffff8163e744>] kernel_thread_helper+0x4/0x10
[  607.202472]  [<ffffffff81635430>] ? retint_restore_args+0x13/0x13
[  607.235884]  [<ffffffff81079140>] ? flush_kthread_work+0x1a0/0x1a0
[  607.268731]  [<ffffffff8163e740>] ? gs_change+0x13/0x13

Signed-off-by: Sage Weil <sage@inktank.com>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8cb272c..073af07 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2118,7 +2118,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 	if (empty)
 		return;
 
-	down_read(&root->fs_info->cleanup_work_sem);
 	spin_lock(&fs_info->delayed_iput_lock);
 	list_splice_init(&fs_info->delayed_iputs, &list);
 	spin_unlock(&fs_info->delayed_iput_lock);
@@ -2129,7 +2128,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 		iput(delayed->inode);
 		kfree(delayed);
 	}
-	up_read(&root->fs_info->cleanup_work_sem);
 }
 
 enum btrfs_orphan_cleanup_state {
-- 
cgit v0.10.2


From 7014cdb49305eda0767d2ae6136f8c191ea8fd81 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 30 Aug 2012 20:06:49 -0400
Subject: Btrfs: btrfs_drop_extent_cache should never fail

I noticed this when I was doing the fsync stuff, we allocate split extents if we
drop an extent range that is in the middle of an existing extent.  This BUG()'s
if we fail to allocate memory, but the fact is this is just a cache, we will
just regenerate the cache if we need it, the important part is that we free the
range we are given.  This can be done without allocations, so if we fail to
allocate splits just skip the splitting stage and free our em and look for more
extents to drop.  This also makes btrfs_drop_extent_cache a void since nobody
was checking the return value anyway.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 88adfe6..b7cd3ad 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3316,8 +3316,8 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct inode *inode);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
-			    int skip_pinned);
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+			     int skip_pinned);
 int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
 			       u64 start, u64 end, int skip_pinned,
 			       int modified);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 57026a6..a50e987 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -459,8 +459,8 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
  * this drops all the extents in the cache that intersect the range
  * [start, end].  Existing extents are split as required.
  */
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
-			       int skip_pinned)
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+			     int skip_pinned)
 {
 	struct extent_map *em;
 	struct extent_map *split = NULL;
@@ -479,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		testend = 0;
 	}
 	while (1) {
+		int no_splits = 0;
+
 		if (!split)
 			split = alloc_extent_map();
 		if (!split2)
 			split2 = alloc_extent_map();
-		BUG_ON(!split || !split2); /* -ENOMEM */
+		if (!split || !split2)
+			no_splits = 1;
 
 		write_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, len);
@@ -509,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		remove_extent_mapping(em_tree, em);
+		if (no_splits)
+			goto next;
 
 		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
 		    em->start < start) {
@@ -559,6 +564,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			free_extent_map(split);
 			split = NULL;
 		}
+next:
 		write_unlock(&em_tree->lock);
 
 		/* once for us */
@@ -570,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		free_extent_map(split);
 	if (split2)
 		free_extent_map(split2);
-	return 0;
 }
 
 /*
-- 
cgit v0.10.2


From cf93dccea67ad8f5e0d9163c6a0a584550bbd7cd Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Sun, 2 Sep 2012 07:44:51 -0600
Subject: Btrfs: fix possible memory leak in scrub_setup_recheck_block()

bbio has been malloced in btrfs_map_block() and should be
freed before leaving from the error handling cases.

spatch with a semantic match is used to found this problem.
(http://coccinelle.lip6.fr/)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b223620..4e9eafe 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1029,6 +1029,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 				spin_lock(&sdev->stat_lock);
 				sdev->stat.malloc_errors++;
 				spin_unlock(&sdev->stat_lock);
+				kfree(bbio);
 				return -ENOMEM;
 			}
 			sblock->page_count++;
-- 
cgit v0.10.2


From 6fa9700e734275de2acbcb0e99414bd7ddfc60f1 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:00:32 -0600
Subject: Btrfs: fix error path in create_pending_snapshot()

This patch fixes the following problem:
- If we failed to deal with the delayed dir items, we should abort transaction,
  just as its comment said. Fix it.
- If root reference or root back reference insertion failed, we should
  abort transaction. Fix it.
- Fix the double free problem of pending->inherit.
- Do not restore the trans->rsv if we doesn't change it.
- make the error path more clearly.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index ffc6b520..7be0318 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -967,18 +967,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	u64 root_flags;
 	uuid_le new_uuid;
 
-	rsv = trans->block_rsv;
-
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
 		ret = pending->error = -ENOMEM;
-		goto fail;
+		goto root_item_alloc_fail;
 	}
 
 	ret = btrfs_find_free_objectid(tree_root, &objectid);
 	if (ret) {
 		pending->error = ret;
-		goto fail;
+		goto no_free_objectid;
 	}
 
 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
@@ -988,22 +986,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 						  to_reserve);
 		if (ret) {
 			pending->error = ret;
-			goto fail;
+			goto no_free_objectid;
 		}
 	}
 
 	ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
 				   objectid, pending->inherit);
-	kfree(pending->inherit);
 	if (ret) {
 		pending->error = ret;
-		goto fail;
+		goto no_free_objectid;
 	}
 
 	key.objectid = objectid;
 	key.offset = (u64)-1;
 	key.type = BTRFS_ROOT_ITEM_KEY;
 
+	rsv = trans->block_rsv;
 	trans->block_rsv = &pending->block_rsv;
 
 	dentry = pending->dentry;
@@ -1023,10 +1021,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				BTRFS_FT_DIR, index);
 	if (ret == -EEXIST) {
 		pending->error = -EEXIST;
-		dput(parent);
 		goto fail;
 	} else if (ret) {
-		goto abort_trans_dput;
+		goto abort_trans;
 	}
 
 	btrfs_i_size_write(parent_inode, parent_inode->i_size +
@@ -1034,7 +1031,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, parent_root, parent_inode);
 	if (ret)
-		goto abort_trans_dput;
+		goto abort_trans;
 
 	/*
 	 * pull in the delayed directory update
@@ -1043,10 +1040,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	 * snapshot
 	 */
 	ret = btrfs_run_delayed_items(trans, root);
-	if (ret) { /* Transaction aborted */
-		dput(parent);
-		goto fail;
-	}
+	if (ret)	/* Transaction aborted */
+		goto abort_trans;
 
 	record_root_in_trans(trans, root);
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
@@ -1079,7 +1074,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (ret) {
 		btrfs_tree_unlock(old);
 		free_extent_buffer(old);
-		goto abort_trans_dput;
+		goto abort_trans;
 	}
 
 	btrfs_set_lock_blocking(old);
@@ -1089,7 +1084,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_tree_unlock(old);
 	free_extent_buffer(old);
 	if (ret)
-		goto abort_trans_dput;
+		goto abort_trans;
 
 	/* see comments in should_cow_block() */
 	root->force_cow = 1;
@@ -1102,7 +1097,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_tree_unlock(tmp);
 	free_extent_buffer(tmp);
 	if (ret)
-		goto abort_trans_dput;
+		goto abort_trans;
 
 	/*
 	 * insert root back/forward references
@@ -1111,9 +1106,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				 parent_root->root_key.objectid,
 				 btrfs_ino(parent_inode), index,
 				 dentry->d_name.name, dentry->d_name.len);
-	dput(parent);
 	if (ret)
-		goto fail;
+		goto abort_trans;
 
 	key.offset = (u64)-1;
 	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
@@ -1125,15 +1119,15 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_reloc_post_snapshot(trans, pending);
 	if (ret)
 		goto abort_trans;
-	ret = 0;
 fail:
-	kfree(new_root_item);
+	dput(parent);
 	trans->block_rsv = rsv;
+no_free_objectid:
+	kfree(new_root_item);
+root_item_alloc_fail:
 	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
 	return ret;
 
-abort_trans_dput:
-	dput(parent);
 abort_trans:
 	btrfs_abort_transaction(trans, root, ret);
 	goto fail;
-- 
cgit v0.10.2


From 361048f586f59d414421c6486dd846063a0cac98 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:00:57 -0600
Subject: Btrfs: fix full backref problem when inserting shared block reference

If we create several snapshots at the same time, the following BUG_ON() will be
triggered.

	kernel BUG at fs/btrfs/extent-tree.c:6047!

Steps to reproduce:
 # mkfs.btrfs <partition>
 # mount <partition> <mnt>
 # cd <mnt>
 # for ((i=0;i<2400;i++)); do touch long_name_to_make_tree_more_deep$i; done
 # for ((i=0; i<4; i++))
 > do
 > mkdir $i
 > for ((j=0; j<200; j++))
 > do
 > btrfs sub snap . $i/$j
 > done &
 > done

The reason is:
Before transaction commit, some operations changed the fs tree and new tree
blocks were allocated because of COW. We used the implicit non-shared back
reference for those newly allocated tree blocks because they were not shared by
two or more trees.

And then we created the first snapshot for the fs tree, according to the back
reference rules, we also used implicit back refs for the child tree blocks of
the root node of the fs tree, now those child nodes/leaves were shared by two
trees.

Then We didn't deal with the delayed references, and continued to change the fs
tree(created the second snapshot and inserted the dir item of the new snapshot
into the fs tree). According to the rules of the back reference, we added full
back refs for those tree blocks whose parents have be shared by two trees.
Now some newly allocated tree blocks had two types of the references.

As we know, the delayed reference system handles these delayed references from
back to front, and the full delayed reference is inserted after the implicit
ones. So when we dealt with the back references of those newly allocated tree
blocks, the full references was dealt with at first. And if the first reference
is a shared back reference and the tree block that the reference points to is
newly allocated, It would be considered as a tree block which is shared by two
or more trees when it is allocated and should be a full back reference not a
implicit one, the flag of its reference also should be set to FULL_BACKREF.
But in fact, it was a non-shared tree block with a implicit reference at
beginning, so it was not compulsory to set the flags to FULL_BACKREF. So BUG_ON
was triggered.

We have several methods to fix this bug:
1. deal with delayed references after the snapshot is created and before we
   change the source tree of the snapshot. This is the easiest and safest way.
2. modify the sort method of the delayed reference tree, make the full delayed
   references be inserted before the implicit ones. It is also very easy, but
   I don't know if it will introduce some problems or not.
3. modify select_delayed_ref() and make it select the implicit delayed reference
   at first. This way is not so good because it may wastes CPU time if we have
   lots of delayed references.
4. set the flags to FULL_BACKREF, this method is a little complex comparing with
   the 1st way.

I chose the 1st way to fix it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7be0318..4b6ce5c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1119,6 +1119,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_reloc_post_snapshot(trans, pending);
 	if (ret)
 		goto abort_trans;
+
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	if (ret)
+		goto abort_trans;
 fail:
 	dput(parent);
 	trans->block_rsv = rsv;
-- 
cgit v0.10.2


From b9a8cc5bef963b76c5b6c3016b7e91988a3e758b Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:01:21 -0600
Subject: Btrfs: fix file extent discount problem in the, snapshot

If a snapshot is created while we are writing some data into the file,
the i_size of the corresponding file in the snapshot will be wrong, it will
be beyond the end of the last file extent. And btrfsck will report:
  root 256 inode 257 errors 100

Steps to reproduce:
 # mkfs.btrfs <partition>
 # mount <partition> <mnt>
 # cd <mnt>
 # dd if=/dev/zero of=tmpfile bs=4M count=1024 &
 # for ((i=0; i<4; i++))
 > do
 > btrfs sub snap . $i
 > done

This because the algorithm of disk_i_size update is wrong. Though there are
some ordered extents behind the current one which we use to update disk_i_size,
it doesn't mean those extents will be dealt with in the same transaction. So
We shouldn't use the offset of those extents to update disk_i_size. Or we will
get the wrong i_size in the snapshot.

We fix this problem by recording the max real i_size. If we find there is a
ordered extent which is in front of the current one and doesn't complete, we
will record the end of the current one into that ordered extent. Surely, if
the current extent holds the end of other extent(it must be greater than
the current one because it is behind the current one), we will record the
number that the current extent holds. In this way, we can exclude the ordered
extents that may not be dealth with in the same transaction, and be easy to
know the real disk_i_size.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 051c7fe..cd8ecb7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -775,7 +775,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
 	u64 disk_i_size;
 	u64 new_i_size;
-	u64 i_size_test;
 	u64 i_size = i_size_read(inode);
 	struct rb_node *node;
 	struct rb_node *prev = NULL;
@@ -835,55 +834,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 			break;
 		if (test->file_offset >= i_size)
 			break;
-		if (test->file_offset >= disk_i_size)
+		if (test->file_offset >= disk_i_size) {
+			/*
+			 * we don't update disk_i_size now, so record this
+			 * undealt i_size. Or we will not know the real
+			 * i_size.
+			 */
+			if (test->outstanding_isize < offset)
+				test->outstanding_isize = offset;
+			if (ordered &&
+			    ordered->outstanding_isize >
+			    test->outstanding_isize)
+				test->outstanding_isize =
+						ordered->outstanding_isize;
 			goto out;
-	}
-	new_i_size = min_t(u64, offset, i_size);
-
-	/*
-	 * at this point, we know we can safely update i_size to at least
-	 * the offset from this ordered extent.  But, we need to
-	 * walk forward and see if ios from higher up in the file have
-	 * finished.
-	 */
-	if (ordered) {
-		node = rb_next(&ordered->rb_node);
-	} else {
-		if (prev)
-			node = rb_next(prev);
-		else
-			node = rb_first(&tree->tree);
-	}
-
-	/*
-	 * We are looking for an area between our current extent and the next
-	 * ordered extent to update the i_size to.  There are 3 cases here
-	 *
-	 * 1) We don't actually have anything and we can update to i_size.
-	 * 2) We have stuff but they already did their i_size update so again we
-	 * can just update to i_size.
-	 * 3) We have an outstanding ordered extent so the most we can update
-	 * our disk_i_size to is the start of the next offset.
-	 */
-	i_size_test = i_size;
-	for (; node; node = rb_next(node)) {
-		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-
-		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
-			continue;
-		if (test->file_offset > offset) {
-			i_size_test = test->file_offset;
-			break;
 		}
 	}
+	new_i_size = min_t(u64, offset, i_size);
 
 	/*
-	 * i_size_test is the end of a region after this ordered
-	 * extent where there are no ordered extents, we can safely set
-	 * disk_i_size to this.
+	 * Some ordered extents may completed before the current one, and
+	 * we hold the real i_size in ->outstanding_isize.
 	 */
-	if (i_size_test > offset)
-		new_i_size = min_t(u64, i_size_test, i_size);
+	if (ordered && ordered->outstanding_isize > new_i_size)
+		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
 	BTRFS_I(inode)->disk_i_size = new_i_size;
 	ret = 0;
 out:
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e03c560..c2443a4 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -96,6 +96,13 @@ struct btrfs_ordered_extent {
 	/* number of bytes that still need writing */
 	u64 bytes_left;
 
+	/*
+	 * the end of the ordered extent which is behind it but
+	 * didn't update disk_i_size. Please see the comment of
+	 * btrfs_ordered_update_i_size();
+	 */
+	u64 outstanding_isize;
+
 	/* flags (described above) */
 	unsigned long flags;
 
-- 
cgit v0.10.2


From 6352b91da1a2108bb8cc5115e8714f90d706f15f Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:01:51 -0600
Subject: Btrfs: use a slab for ordered extents allocation

The ordered extent allocation is in the fast path of the IO, so use a slab
to improve the speed of the allocation.

 "Size of the struct is 280, so this will fall into the size-512 bucket,
  giving 8 objects per page, while own slab will pack 14 objects into a page.

  Another benefit I see is to check for leaked objects when the module is
  removed (and the cache destroy takes place)."
						-- David Sterba

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index cd8ecb7..e2b3d99 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,8 @@
 #include "btrfs_inode.h"
 #include "extent_io.h"
 
+static struct kmem_cache *btrfs_ordered_extent_cache;
+
 static u64 entry_end(struct btrfs_ordered_extent *entry)
 {
 	if (entry->file_offset + entry->len < entry->file_offset)
@@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	struct btrfs_ordered_extent *entry;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	entry = kzalloc(sizeof(*entry), GFP_NOFS);
+	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
 	if (!entry)
 		return -ENOMEM;
 
@@ -421,7 +423,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 			list_del(&sum->list);
 			kfree(sum);
 		}
-		kfree(entry);
+		kmem_cache_free(btrfs_ordered_extent_cache, entry);
 	}
 }
 
@@ -958,3 +960,20 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 	}
 	spin_unlock(&root->fs_info->ordered_extent_lock);
 }
+
+int __init ordered_data_init(void)
+{
+	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
+				     sizeof(struct btrfs_ordered_extent), 0,
+				     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				     NULL);
+	if (!btrfs_ordered_extent_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void ordered_data_exit(void)
+{
+	if (btrfs_ordered_extent_cache)
+		kmem_cache_destroy(btrfs_ordered_extent_cache);
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c2443a4..d1ddaef 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -192,4 +192,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				 struct inode *inode);
 void btrfs_wait_ordered_extents(struct btrfs_root *root,
 				int nocow_only, int delay_iput);
+int __init ordered_data_init(void);
+void ordered_data_exit(void);
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 83d6f9f..06ff1dd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1620,10 +1620,14 @@ static int __init init_btrfs_fs(void)
 	if (err)
 		goto free_extent_io;
 
-	err = btrfs_delayed_inode_init();
+	err = ordered_data_init();
 	if (err)
 		goto free_extent_map;
 
+	err = btrfs_delayed_inode_init();
+	if (err)
+		goto free_ordered_data;
+
 	err = btrfs_interface_init();
 	if (err)
 		goto free_delayed_inode;
@@ -1641,6 +1645,8 @@ unregister_ioctl:
 	btrfs_interface_exit();
 free_delayed_inode:
 	btrfs_delayed_inode_exit();
+free_ordered_data:
+	ordered_data_exit();
 free_extent_map:
 	extent_map_exit();
 free_extent_io:
@@ -1657,6 +1663,7 @@ static void __exit exit_btrfs_fs(void)
 {
 	btrfs_destroy_cachep();
 	btrfs_delayed_inode_exit();
+	ordered_data_exit();
 	extent_map_exit();
 	extent_io_exit();
 	btrfs_interface_exit();
-- 
cgit v0.10.2


From 66d8f3dd1c87813d7f1cf8b774cb03e9b8d7e87e Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:02:28 -0600
Subject: Btrfs: add a new "type" field into the block reservation structure

Sometimes we need choose the method of the reservation according to the type
of the block reservation, such as the reservation for the delayed inode update.
Now we identify the type just by comparing the address of the reservation
variants, it is very ugly if it is a temporary one because we need compare it
with all the common reservation variants. So we add a new "type" field to keep
the type the reservation variants.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7cd3ad..2990a7e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1028,13 +1028,22 @@ struct btrfs_space_info {
 	wait_queue_head_t wait;
 };
 
+#define	BTRFS_BLOCK_RSV_GLOBAL		1
+#define	BTRFS_BLOCK_RSV_DELALLOC	2
+#define	BTRFS_BLOCK_RSV_TRANS		3
+#define	BTRFS_BLOCK_RSV_CHUNK		4
+#define	BTRFS_BLOCK_RSV_DELOPS		5
+#define	BTRFS_BLOCK_RSV_EMPTY		6
+#define	BTRFS_BLOCK_RSV_TEMP		7
+
 struct btrfs_block_rsv {
 	u64 size;
 	u64 reserved;
 	struct btrfs_space_info *space_info;
 	spinlock_t lock;
-	unsigned int full;
-	unsigned int failfast;
+	unsigned short full;
+	unsigned short type;
+	unsigned short failfast;
 };
 
 /*
@@ -2875,8 +2884,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+					      unsigned short type);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 07d5eeb..eb768c4 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -650,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
 	 * we're accounted for.
 	 */
 	if (!src_rsv || (!trans->bytes_reserved &&
-	    src_rsv != &root->fs_info->delalloc_block_rsv)) {
+			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
 		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
 		/*
 		 * Since we're under a transaction reserve_metadata_bytes could
@@ -668,7 +668,7 @@ static int btrfs_delayed_inode_reserve_metadata(
 						      num_bytes, 1);
 		}
 		return ret;
-	} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+	} else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
 		spin_lock(&BTRFS_I(inode)->lock);
 		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
 				       &BTRFS_I(inode)->runtime_flags)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22e98e0..0dcfb99 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2014,12 +2014,15 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	btrfs_mapping_init(&fs_info->mapping_tree);
-	btrfs_init_block_rsv(&fs_info->global_block_rsv);
-	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
-	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
-	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
-	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-	btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
+	btrfs_init_block_rsv(&fs_info->global_block_rsv,
+			     BTRFS_BLOCK_RSV_GLOBAL);
+	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+			     BTRFS_BLOCK_RSV_DELALLOC);
+	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+			     BTRFS_BLOCK_RSV_DELOPS);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ee51b7a..36e0331 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4108,13 +4108,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
 	return 0;
 }
 
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
 {
 	memset(rsv, 0, sizeof(*rsv));
 	spin_lock_init(&rsv->lock);
+	rsv->type = type;
 }
 
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+					      unsigned short type)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4123,7 +4125,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 	if (!block_rsv)
 		return NULL;
 
-	btrfs_init_block_rsv(block_rsv);
+	btrfs_init_block_rsv(block_rsv, type);
 	block_rsv->space_info = __find_space_info(fs_info,
 						  BTRFS_BLOCK_GROUP_METADATA);
 	return block_rsv;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a50e987..a9d7815c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1874,7 +1874,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		goto out;
 	}
 
-	rsv = btrfs_alloc_block_rsv(root);
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv) {
 		ret = -ENOMEM;
 		goto out_free;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 073af07..d34eb32 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2195,7 +2195,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	int ret;
 
 	if (!root->orphan_block_rsv) {
-		block_rsv = btrfs_alloc_block_rsv(root);
+		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 		if (!block_rsv)
 			return -ENOMEM;
 	}
@@ -3070,7 +3070,7 @@ out:
 static void __unlink_end_trans(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root)
 {
-	if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+	if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
 		btrfs_block_rsv_release(root, trans->block_rsv,
 					trans->bytes_reserved);
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3821,7 +3821,7 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 
-	rsv = btrfs_alloc_block_rsv(root);
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv) {
 		btrfs_orphan_del(NULL, inode);
 		goto no_delete;
@@ -6851,7 +6851,7 @@ static int btrfs_truncate(struct inode *inode)
 	 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
 	 * updating the inode.
 	 */
-	rsv = btrfs_alloc_block_rsv(root);
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv)
 		return -ENOMEM;
 	rsv->size = min_size;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5543fd5..e6934de 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -516,7 +516,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!pending_snapshot)
 		return -ENOMEM;
 
-	btrfs_init_block_rsv(&pending_snapshot->block_rsv);
+	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
+			     BTRFS_BLOCK_RSV_TEMP);
 	pending_snapshot->dentry = dentry;
 	pending_snapshot->root = root;
 	pending_snapshot->readonly = readonly;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4da0865..5a15c96 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3674,7 +3674,8 @@ int prepare_to_relocate(struct reloc_control *rc)
 	struct btrfs_trans_handle *trans;
 	int ret;
 
-	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
+					      BTRFS_BLOCK_RSV_TEMP);
 	if (!rc->block_rsv)
 		return -ENOMEM;
 
-- 
cgit v0.10.2


From 42874b3db7817f662b1d7c6e32f8b63638fa0321 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:03:32 -0600
Subject: Btrfs: fix the snapshot that should not exist

The snapshot should be the image of the fs tree before it was created,
so the metadata of the snapshot should not exist in the its tree. But now, we
found the directory item and directory name index is in both the snapshot tree
and the fs tree. It introduces some problems and makes the users feel strange:

 # mkfs.btrfs /dev/sda1
 # mount /dev/sda1 /mnt
 # mkdir /mnt/1
 # cd /mnt/1
 # btrfs subvolume snapshot /mnt snap0
 # ls -a /mnt/1/snap0/1
 .	..	[no other file/dir]

 # ll /mnt/1/snap0/
 total 0
 drwxr-xr-x 1 root root 10 Ju1 24 12:11 1
			^^^
			There is no file/dir in it, but it's size is 10

 # cd /mnt/1/snap0/1/snap0
 [Enter a unexisted directory successfully...]

There is nothing in the directory 1 in snap0, but btrfs told the length of
this directory is 10. Beside that, we can enter an unexisted directory, it is
very strange to the users.

 # btrfs subvolume snapshot /mnt/1/snap0 /mnt/snap1
 # ll /mnt/1/snap0/1/
 total 0
 [None]
 # ll /mnt/snap1/1/
 total 0
 drwxr-xr-x 1 root root 0 Ju1 24 12:14 snap0

And the source of snap1 did have any directory in Directory 1, but snap1 have
a snap0, it is different between the source and the snapshot.

So I think we should insert directory item and directory name index and update
the parent inode as the last step of snapshot creation, and do not leave the
useless metadata in the file tree.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4b6ce5c..a86fc72 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -955,6 +955,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct btrfs_root *parent_root;
 	struct btrfs_block_rsv *rsv;
 	struct inode *parent_inode;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *dir_item;
 	struct dentry *parent;
 	struct dentry *dentry;
 	struct extent_buffer *tmp;
@@ -967,6 +969,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	u64 root_flags;
 	uuid_le new_uuid;
 
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = pending->error = -ENOMEM;
+		goto path_alloc_fail;
+	}
+
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
 		ret = pending->error = -ENOMEM;
@@ -1015,23 +1023,20 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	 */
 	ret = btrfs_set_inode_index(parent_inode, &index);
 	BUG_ON(ret); /* -ENOMEM */
-	ret = btrfs_insert_dir_item(trans, parent_root,
-				dentry->d_name.name, dentry->d_name.len,
-				parent_inode, &key,
-				BTRFS_FT_DIR, index);
-	if (ret == -EEXIST) {
+
+	/* check if there is a file/dir which has the same name. */
+	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
+					 btrfs_ino(parent_inode),
+					 dentry->d_name.name,
+					 dentry->d_name.len, 0);
+	if (dir_item != NULL && !IS_ERR(dir_item)) {
 		pending->error = -EEXIST;
 		goto fail;
-	} else if (ret) {
+	} else if (IS_ERR(dir_item)) {
+		ret = PTR_ERR(dir_item);
 		goto abort_trans;
 	}
-
-	btrfs_i_size_write(parent_inode, parent_inode->i_size +
-					 dentry->d_name.len * 2);
-	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
-	ret = btrfs_update_inode(trans, parent_root, parent_inode);
-	if (ret)
-		goto abort_trans;
+	btrfs_release_path(path);
 
 	/*
 	 * pull in the delayed directory update
@@ -1123,12 +1128,30 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 	if (ret)
 		goto abort_trans;
+
+	ret = btrfs_insert_dir_item(trans, parent_root,
+				    dentry->d_name.name, dentry->d_name.len,
+				    parent_inode, &key,
+				    BTRFS_FT_DIR, index);
+	/* We have check then name at the beginning, so it is impossible. */
+	BUG_ON(ret == -EEXIST);
+	if (ret)
+		goto abort_trans;
+
+	btrfs_i_size_write(parent_inode, parent_inode->i_size +
+					 dentry->d_name.len * 2);
+	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode(trans, parent_root, parent_inode);
+	if (ret)
+		goto abort_trans;
 fail:
 	dput(parent);
 	trans->block_rsv = rsv;
 no_free_objectid:
 	kfree(new_root_item);
 root_item_alloc_fail:
+	btrfs_free_path(path);
+path_alloc_fail:
 	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
 	return ret;
 
@@ -1472,13 +1495,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 */
 	mutex_lock(&root->fs_info->reloc_mutex);
 
-	ret = btrfs_run_delayed_items(trans, root);
+	/*
+	 * We needn't worry about the delayed items because we will
+	 * deal with them in create_pending_snapshot(), which is the
+	 * core function of the snapshot creation.
+	 */
+	ret = create_pending_snapshots(trans, root->fs_info);
 	if (ret) {
 		mutex_unlock(&root->fs_info->reloc_mutex);
 		goto cleanup_transaction;
 	}
 
-	ret = create_pending_snapshots(trans, root->fs_info);
+	/*
+	 * We insert the dir indexes of the snapshots and update the inode
+	 * of the snapshots' parents after the snapshot creation, so there
+	 * are some delayed items which are not dealt with. Now deal with
+	 * them.
+	 *
+	 * We needn't worry that this operation will corrupt the snapshots,
+	 * because all the tree which are snapshoted will be forced to COW
+	 * the nodes and leaves.
+	 */
+	ret = btrfs_run_delayed_items(trans, root);
 	if (ret) {
 		mutex_unlock(&root->fs_info->reloc_mutex);
 		goto cleanup_transaction;
-- 
cgit v0.10.2


From 48c03c4bcfd7a1fcb1e05e9b1db1188cdbecf49a Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:03:56 -0600
Subject: Btrfs: fix wrong size for the reservation of the, snapshot creation

We should insert/update 6 items(root ref, root backref, dir item, dir index,
root item and parent inode) when creating a snapshot, not 5 items, fix it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 36e0331..b6b33e4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4414,10 +4414,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
 	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
 	/*
-	 * two for root back/forward refs, two for directory entries
-	 * and one for root of the snapshot.
+	 * two for root back/forward refs, two for directory entries,
+	 * one for root of the snapshot and one for parent inode.
 	 */
-	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
 	dst_rsv->space_info = src_rsv->space_info;
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e6934de..c56daa3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -526,7 +526,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 		*inherit = NULL;	/* take responsibility to free it */
 	}
 
-	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+	trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		goto fail;
-- 
cgit v0.10.2


From 2ecb79239bcd04c9d410f4cdce16adb6840b19da Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:04:27 -0600
Subject: Btrfs: fix unprotected ->log_batch

We forget to protect ->log_batch when syncing a file, this patch fix
this problem by atomic operation. And ->log_batch is used to check
if there are parallel sync operations or not, so it is unnecessary to
reset it to 0 after the sync operation of the current log tree complete.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2990a7e..6923b9e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1491,9 +1491,9 @@ struct btrfs_root {
 	wait_queue_head_t log_commit_wait[2];
 	atomic_t log_writers;
 	atomic_t log_commit[2];
+	atomic_t log_batch;
 	unsigned long log_transid;
 	unsigned long last_log_commit;
-	unsigned long log_batch;
 	pid_t log_start_pid;
 	bool log_multiple_pids;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0dcfb99..34717ac 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1168,8 +1168,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->log_commit[0], 0);
 	atomic_set(&root->log_commit[1], 0);
 	atomic_set(&root->log_writers, 0);
+	atomic_set(&root->log_batch, 0);
 	atomic_set(&root->orphan_inodes, 0);
-	root->log_batch = 0;
 	root->log_transid = 0;
 	root->last_log_commit = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a9d7815c..793bc89 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1551,9 +1551,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * ordered range does a filemape_write_and_wait_range which is why we
 	 * don't do it above like other file systems.
 	 */
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 	btrfs_wait_ordered_range(inode, start, end);
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 
 	/*
 	 * check the transaction that last modified this inode
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f9b0fc9..038a522 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -147,7 +147,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			root->log_multiple_pids = true;
 		}
 
-		root->log_batch++;
+		atomic_inc(&root->log_batch);
 		atomic_inc(&root->log_writers);
 		mutex_unlock(&root->log_mutex);
 		return 0;
@@ -166,7 +166,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			err = ret;
 	}
 	mutex_unlock(&root->fs_info->tree_log_mutex);
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 	atomic_inc(&root->log_writers);
 	mutex_unlock(&root->log_mutex);
 	return err;
@@ -2036,7 +2036,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
 		wait_log_commit(trans, root, root->log_transid - 1);
 	while (1) {
-		unsigned long batch = root->log_batch;
+		int batch = atomic_read(&root->log_batch);
 		/* when we're on an ssd, just kick the log commit out */
 		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
 			mutex_unlock(&root->log_mutex);
@@ -2044,7 +2044,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			mutex_lock(&root->log_mutex);
 		}
 		wait_for_writer(trans, root);
-		if (batch == root->log_batch)
+		if (batch == atomic_read(&root->log_batch))
 			break;
 	}
 
@@ -2073,7 +2073,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	btrfs_set_root_node(&log->root_item, log->node);
 
-	root->log_batch = 0;
 	root->log_transid++;
 	log->log_transid = root->log_transid;
 	root->log_start_pid = 0;
@@ -2086,7 +2085,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->log_mutex);
 
 	mutex_lock(&log_root_tree->log_mutex);
-	log_root_tree->log_batch++;
+	atomic_inc(&log_root_tree->log_batch);
 	atomic_inc(&log_root_tree->log_writers);
 	mutex_unlock(&log_root_tree->log_mutex);
 
@@ -2156,7 +2155,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
 				btrfs_header_level(log_root_tree->node));
 
-	log_root_tree->log_batch = 0;
 	log_root_tree->log_transid++;
 	smp_mb();
 
-- 
cgit v0.10.2


From 69ce977a179750915e04fcc12bfbe33e6c8f5132 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:04:44 -0600
Subject: Btrfs: output more information when aborting a unused transaction
 handle

Though we dump the stack information when aborting a unused transaction
handle, we don't know the correct place where we decide to abort the
transaction handle if one function has several place where the transaction
abort function is invoked and jumps to the same place after this call.
And beside that we also don't know the reason why we jump to abort
the current handle. So I modify the transaction abort function and make
it output the function name, line and error information.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 06ff1dd..867e8e7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -248,7 +248,13 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 	/* Nothing used. The other threads that have joined this
 	 * transaction may be able to continue. */
 	if (!trans->blocks_used) {
-		btrfs_printk(root->fs_info, "Aborting unused transaction.\n");
+		char nbuf[16];
+		const char *errstr;
+
+		errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
+		btrfs_printk(root->fs_info,
+			     "%s:%d: Aborting unused transaction(%s).\n",
+			     function, line, errstr);
 		return;
 	}
 	trans->transaction->aborted = errno;
-- 
cgit v0.10.2


From 903889f462409c816893abd02d88636f7b4a7774 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:04:57 -0600
Subject: Btrfs: fix wrong size for the reservation when doing, file
 pre-allocation.

When we ran fsstress(a program in xfstests), the filesystem hung up when it
is full. It was because the space reserved in btrfs_fallocate() was wrong,
btrfs_fallocate() just used the size of the pre-allocation to reserve the
space, didn't took the block size aligning into account, so the size of
the reserved space was less than the allocated space, it caused the over
reserve problem and made the filesystem hung up when invoking cow_file_range().
Fix it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 793bc89..00279c9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2000,7 +2000,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 	 * Make sure we have enough space before we do the
 	 * allocation.
 	 */
-	ret = btrfs_check_data_free_space(inode, len);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
 	if (ret)
 		return ret;
 
@@ -2107,7 +2107,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
 	mutex_unlock(&inode->i_mutex);
 	/* Let go of our reservation. */
-	btrfs_free_reserved_data_space(inode, len);
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
 	return ret;
 }
 
-- 
cgit v0.10.2


From f54fb859da53f04a443c5e3f4cb9b936ed42d227 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 6 Sep 2012 03:08:59 -0600
Subject: Btrfs: fix error handling in delete_block_group_cache()

btrfs_iget() never return NULL.
So, NULL check is unnecessary.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 5a15c96..7e7fd1b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3270,8 +3270,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
 	key.offset = 0;
 
 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
-	if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) {
-		if (inode && !IS_ERR(inode))
+	if (IS_ERR(inode) || is_bad_inode(inode)) {
+		if (!IS_ERR(inode))
 			iput(inode);
 		return -ENOENT;
 	}
-- 
cgit v0.10.2


From 3d6b5c3b5c0b970ce8a9d3bac6854f5c0ce0295a Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 6 Sep 2012 00:18:10 -0600
Subject: Btrfs: check return value of ulist_alloc() properly

ulist_alloc() has the possibility of returning NULL.
So, it is necessary to check the return value.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9b707ba..5039686 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1469,6 +1469,10 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 	 * be exceeded
 	 */
 	ulist = ulist_alloc(GFP_ATOMIC);
+	if (!ulist) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {
@@ -1541,6 +1545,10 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 		goto out;
 
 	ulist = ulist_alloc(GFP_ATOMIC);
+	if (!ulist) {
+		btrfs_std_error(fs_info, -ENOMEM);
+		goto out;
+	}
 	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {
-- 
cgit v0.10.2


From 9e8a4a8b0b9484e8d14674fc62c9ad8ac9dbce5b Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 5 Sep 2012 19:10:51 -0600
Subject: Btrfs: use flag EXTENT_DEFRAG for snapshot-aware defrag

We're going to use this flag EXTENT_DEFRAG to indicate which range
belongs to defragment so that we can implement snapshow-aware defrag:

We set the EXTENT_DEFRAG flag when dirtying the extents that need
defragmented, so later on writeback thread can differentiate between
normal writeback and writeback started by defragmentation.

Original-Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 19319f5..4a41b17 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1144,6 +1144,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			      NULL, cached_state, mask);
 }
 
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+		      struct extent_state **cached_state, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end,
+			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+			      NULL, cached_state, mask);
+}
+
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 25900af..512f8da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -235,6 +235,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		       int bits, int clear_bits, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask);
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+		      struct extent_state **cached_state, gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			  u64 *start_ret, u64 *end_ret, int bits);
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 00279c9..0a4b03d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1203,8 +1203,8 @@ again:
 
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
 				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-				  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
-				  GFP_NOFS);
+				  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+				  0, 0, &cached_state, GFP_NOFS);
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 				     start_pos, last_pos - 1, &cached_state,
 				     GFP_NOFS);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d34eb32..9b21cf9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3549,7 +3549,8 @@ again:
 	}
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			  EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 			  0, 0, &cached_state, GFP_NOFS);
 
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6061,7 +6062,8 @@ unlock:
 	if (lockstart < lockend) {
 		if (create && len < lockend - lockstart) {
 			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-					 lockstart + len - 1, unlock_bits, 1, 0,
+					 lockstart + len - 1,
+					 unlock_bits | EXTENT_DEFRAG, 1, 0,
 					 &cached_state, GFP_NOFS);
 			/*
 			 * Beside unlock, we also need to cleanup reserved space
@@ -6069,8 +6071,8 @@ unlock:
 			 */
 			clear_extent_bit(&BTRFS_I(inode)->io_tree,
 					 lockstart + len, lockend,
-					 unlock_bits | EXTENT_DO_ACCOUNTING,
-					 1, 0, NULL, GFP_NOFS);
+					 unlock_bits | EXTENT_DO_ACCOUNTING |
+					 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
 		} else {
 			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
 					 lockend, unlock_bits, 1, 0,
@@ -6635,8 +6637,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		 */
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
-				 &cached_state, GFP_NOFS);
+				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+				 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
 		/*
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
@@ -6652,7 +6654,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 	}
 	clear_extent_bit(tree, page_start, page_end,
 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
+		 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+		 &cached_state, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
@@ -6749,7 +6752,8 @@ again:
 	 * prepare_pages in the normal write path.
 	 */
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			  EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 			  0, 0, &cached_state, GFP_NOFS);
 
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c56daa3..de886ac 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1023,8 +1023,8 @@ again:
 			 page_start, page_end - 1, 0, &cached_state);
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
 			  page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-			  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
-			  GFP_NOFS);
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+			  &cached_state, GFP_NOFS);
 
 	if (i_done != page_cnt) {
 		spin_lock(&BTRFS_I(inode)->lock);
@@ -1035,8 +1035,8 @@ again:
 	}
 
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
-				  &cached_state);
+	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
+			  &cached_state, GFP_NOFS);
 
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 			     page_start, page_end - 1, &cached_state,
-- 
cgit v0.10.2


From dea31f52337c18f19eadfbbccb0c477942dad495 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 6 Sep 2012 16:47:00 -0400
Subject: Btrfs: wait on async pages when shrinking delalloc

Mitch reported a problem where you could get an ENOSPC error when untarring
a kernel git tree onto a 16gb file system with compress-force=zlib.  This is
because compression is a huge pain, it will return from ->writepages()
without having actually created any ordered extents.  To get around this we
check to see if the async submit counter is up, and if it is wait until it
drops to 0 before doing our normal ordered wait dance.  With this patch I
can now untar a kernel git tree onto a 16gb file system without getting
ENOSPC errors.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b6b33e4..06369d5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3697,6 +3697,13 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
 					       WB_REASON_FS_FREE_SPACE);
 
+		/*
+		 * We need to wait for the async pages to actually start before
+		 * we do anything.
+		 */
+		wait_event(root->fs_info->async_submit_wait,
+			   !atomic_read(&root->fs_info->async_delalloc_pages));
+
 		spin_lock(&space_info->lock);
 		if (space_info->bytes_used + space_info->bytes_reserved +
 		    space_info->bytes_pinned + space_info->bytes_readonly +
-- 
cgit v0.10.2


From a80c8dcf7e5065adc555ef8ffb256df11e3293e3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 6 Sep 2012 16:59:33 -0400
Subject: Btrfs: fix our overcommit math

I noticed I was seeing large lags when running my torrent test in a vm on my
laptop.  While trying to make it lag less I noticed that our overcommit math
was taking into account the number of bytes we wanted to reclaim, not the
number of bytes we actually wanted to allocate, which means we wouldn't
overcommit as often.  This patch fixes the overcommit math and makes
shrink_delalloc() use that logic so that it will stop looping faster.  We
still have pretty high spikes of latency, but the test now takes 3 minutes
less time (about 5% faster).  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 06369d5..a8de1c3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3663,6 +3663,46 @@ out:
 	return ret;
 }
 
+static int can_overcommit(struct btrfs_root *root,
+			  struct btrfs_space_info *space_info, u64 bytes,
+			  int flush)
+{
+	u64 profile = btrfs_get_alloc_profile(root, 0);
+	u64 avail;
+	u64 used;
+
+	used = space_info->bytes_used + space_info->bytes_reserved +
+		space_info->bytes_pinned + space_info->bytes_readonly +
+		space_info->bytes_may_use;
+
+	spin_lock(&root->fs_info->free_chunk_lock);
+	avail = root->fs_info->free_chunk_space;
+	spin_unlock(&root->fs_info->free_chunk_lock);
+
+	/*
+	 * If we have dup, raid1 or raid10 then only half of the free
+	 * space is actually useable.
+	 */
+	if (profile & (BTRFS_BLOCK_GROUP_DUP |
+		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_RAID10))
+		avail >>= 1;
+
+	/*
+	 * If we aren't flushing don't let us overcommit too much, say
+	 * 1/8th of the space.  If we can flush, let it overcommit up to
+	 * 1/2 of the space.
+	 */
+	if (flush)
+		avail >>= 3;
+	else
+		avail >>= 1;
+
+	if (used + bytes < space_info->total_bytes + avail)
+		return 1;
+	return 0;
+}
+
 /*
  * shrink metadata reservation for delalloc
  */
@@ -3705,10 +3745,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 			   !atomic_read(&root->fs_info->async_delalloc_pages));
 
 		spin_lock(&space_info->lock);
-		if (space_info->bytes_used + space_info->bytes_reserved +
-		    space_info->bytes_pinned + space_info->bytes_readonly +
-		    space_info->bytes_may_use + orig <=
-		    space_info->total_bytes) {
+		if (can_overcommit(root, space_info, orig, !trans)) {
 			spin_unlock(&space_info->lock);
 			break;
 		}
@@ -3924,7 +3961,6 @@ again:
 	}
 
 	if (ret) {
-		u64 profile = btrfs_get_alloc_profile(root, 0);
 		u64 avail;
 
 		/*
@@ -3945,30 +3981,7 @@ again:
 			goto again;
 		}
 
-		spin_lock(&root->fs_info->free_chunk_lock);
-		avail = root->fs_info->free_chunk_space;
-
-		/*
-		 * If we have dup, raid1 or raid10 then only half of the free
-		 * space is actually useable.
-		 */
-		if (profile & (BTRFS_BLOCK_GROUP_DUP |
-			       BTRFS_BLOCK_GROUP_RAID1 |
-			       BTRFS_BLOCK_GROUP_RAID10))
-			avail >>= 1;
-
-		/*
-		 * If we aren't flushing don't let us overcommit too much, say
-		 * 1/8th of the space.  If we can flush, let it overcommit up to
-		 * 1/2 of the space.
-		 */
-		if (flush)
-			avail >>= 3;
-		else
-			avail >>= 1;
-		 spin_unlock(&root->fs_info->free_chunk_lock);
-
-		if (used + num_bytes < space_info->total_bytes + avail) {
+		if (can_overcommit(root, space_info, orig_bytes, flush)) {
 			space_info->bytes_may_use += orig_bytes;
 			trace_btrfs_space_reservation(root->fs_info,
 				"space_info", space_info->flags, orig_bytes, 1);
-- 
cgit v0.10.2


From 837e197283199de640857192ca32767cb6e24fe8 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 7 Sep 2012 03:00:48 -0600
Subject: btrfs: polish names of kmem caches

Usecase:

  watch 'grep btrfs < /proc/slabinfo'

easy to watch all caches in one go.

Signed-off-by: David Sterba <dsterba@suse.cz>

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index eb768c4..b26d2f9 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;
 
 int __init btrfs_delayed_inode_init(void)
 {
-	delayed_node_cache = kmem_cache_create("delayed_node",
+	delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
 					sizeof(struct btrfs_delayed_node),
 					0,
 					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4a41b17..3ad84f5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -64,13 +64,13 @@ tree_fs_info(struct extent_io_tree *tree)
 
 int __init extent_io_init(void)
 {
-	extent_state_cache = kmem_cache_create("extent_state",
+	extent_state_cache = kmem_cache_create("btrfs_extent_state",
 			sizeof(struct extent_state), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_state_cache)
 		return -ENOMEM;
 
-	extent_buffer_cache = kmem_cache_create("extent_buffers",
+	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
 			sizeof(struct extent_buffer), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_buffer_cache)
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ac606f0..8d1364d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;
 
 int __init extent_map_init(void)
 {
-	extent_map_cache = kmem_cache_create("extent_map",
+	extent_map_cache = kmem_cache_create("btrfs_extent_map",
 			sizeof(struct extent_map), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_map_cache)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9b21cf9..aae5562 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7155,31 +7155,31 @@ void btrfs_destroy_cachep(void)
 
 int btrfs_init_cachep(void)
 {
-	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
 			sizeof(struct btrfs_inode), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
 	if (!btrfs_inode_cachep)
 		goto fail;
 
-	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
 			sizeof(struct btrfs_trans_handle), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_trans_handle_cachep)
 		goto fail;
 
-	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
 			sizeof(struct btrfs_transaction), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_transaction_cachep)
 		goto fail;
 
-	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+	btrfs_path_cachep = kmem_cache_create("btrfs_path",
 			sizeof(struct btrfs_path), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_path_cachep)
 		goto fail;
 
-	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
+	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
 			sizeof(struct btrfs_free_space), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_free_space_cachep)
-- 
cgit v0.10.2


From 8407aa464331556e4f6784f974030b83fc7585ed Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 7 Sep 2012 01:43:32 -0600
Subject: Btrfs: fix corrupted metadata in the snapshot

When we delete a inode, we will remove all the delayed items including delayed
inode update, and then truncate all the relative metadata. If there is lots of
metadata, we will end the current transaction, and start a new transaction to
truncate the left metadata. In this way, we will leave a inode item that its
link counter is > 0, and also may leave some directory index items in fs/file tree
after the current transaction ends. In other words, the metadata in this fs/file tree
is inconsistent. If we create a snapshot for this tree now, we will find a inode with
corrupted metadata in the new snapshot, and we won't continue to drop the left metadata,
because its link counter is not 0.

We fix this problem by updating the inode item before the current transaction ends.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index aae5562..9e9754a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3834,15 +3834,10 @@ void btrfs_evict_inode(struct inode *inode)
 	btrfs_i_size_write(inode, 0);
 
 	/*
-	 * This is a bit simpler than btrfs_truncate since
-	 *
-	 * 1) We've already reserved our space for our orphan item in the
-	 *    unlink.
-	 * 2) We're going to delete the inode item, so we don't need to update
-	 *    it at all.
-	 *
-	 * So we just need to reserve some slack space in case we add bytes when
-	 * doing the truncate.
+	 * This is a bit simpler than btrfs_truncate since we've already
+	 * reserved our space for our orphan item in the unlink, so we just
+	 * need to reserve some slack space in case we add bytes and update
+	 * inode item when doing the truncate.
 	 */
 	while (1) {
 		ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
@@ -3863,7 +3858,7 @@ void btrfs_evict_inode(struct inode *inode)
 			goto no_delete;
 		}
 
-		trans = btrfs_start_transaction(root, 0);
+		trans = btrfs_start_transaction_noflush(root, 1);
 		if (IS_ERR(trans)) {
 			btrfs_orphan_del(NULL, inode);
 			btrfs_free_block_rsv(root, rsv);
@@ -3876,6 +3871,10 @@ void btrfs_evict_inode(struct inode *inode)
 		if (ret != -ENOSPC)
 			break;
 
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+		ret = btrfs_update_inode(trans, root, inode);
+		BUG_ON(ret);
+
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
 		trans = NULL;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a86fc72..f8ae448 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -290,7 +290,8 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
 }
 
 static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
-						    u64 num_items, int type)
+						    u64 num_items, int type,
+						    int noflush)
 {
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
@@ -324,9 +325,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 		}
 
 		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
-		ret = btrfs_block_rsv_add(root,
-					  &root->fs_info->trans_block_rsv,
-					  num_bytes);
+		if (noflush)
+			ret = btrfs_block_rsv_add_noflush(root,
+						&root->fs_info->trans_block_rsv,
+						num_bytes);
+		else
+			ret = btrfs_block_rsv_add(root,
+						&root->fs_info->trans_block_rsv,
+						num_bytes);
 		if (ret)
 			return ERR_PTR(ret);
 	}
@@ -393,21 +399,28 @@ got_it:
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_items)
 {
-	return start_transaction(root, num_items, TRANS_START);
+	return start_transaction(root, num_items, TRANS_START, 0);
 }
+
+struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+					struct btrfs_root *root, int num_items)
+{
+	return start_transaction(root, num_items, TRANS_START, 1);
+}
+
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_JOIN);
+	return start_transaction(root, 0, TRANS_JOIN, 0);
 }
 
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
+	return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
 }
 
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_USERSPACE);
+	return start_transaction(root, 0, TRANS_USERSPACE, 0);
 }
 
 /* wait for a transaction commit to be fully complete */
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 1a138bf..1b05480 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -97,6 +97,8 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+					struct btrfs_root *root, int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
-- 
cgit v0.10.2


From 0647d6bd16c03c928a22d1b90c4c8684c5908f08 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Fri, 7 Sep 2012 20:01:26 -0600
Subject: Btrfs: cleanup for unused ref cache stuff

As ref cache has been removed from btrfs, there is no user on
its lock and its check.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6923b9e..305002b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1376,9 +1376,6 @@ struct btrfs_fs_info {
 	struct rb_root defrag_inodes;
 	atomic_t defrag_running;
 
-	spinlock_t ref_cache_lock;
-	u64 total_ref_cache_size;
-
 	/*
 	 * these three are in extended format (availability of single
 	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 34717ac..8054f7c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2000,7 +2000,6 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->caching_block_groups);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->trans_lock);
-	spin_lock_init(&fs_info->ref_cache_lock);
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -3214,10 +3213,6 @@ int close_ctree(struct btrfs_root *root)
 		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
 		       (unsigned long long)fs_info->delalloc_bytes);
 	}
-	if (fs_info->total_ref_cache_size) {
-		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
-		       (unsigned long long)fs_info->total_ref_cache_size);
-	}
 
 	free_extent_buffer(fs_info->extent_root->node);
 	free_extent_buffer(fs_info->extent_root->commit_root);
-- 
cgit v0.10.2


From dea7d76ecbfb53cda6aadd9bed33e87d255c5b02 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 7 Sep 2012 20:01:27 -0600
Subject: Btrfs: update delayed ref's tracepoints to show sequence

We've added a new field 'sequence' to delayed ref node, so update related
tracepoints.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 91b91e8..54fab04 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -445,6 +445,7 @@ TRACE_EVENT(btrfs_delayed_tree_ref,
 		__field(	u64,  ref_root		)
 		__field(	int,  level		)
 		__field(	int,  type		)
+		__field(	u64,  seq		)
 	),
 
 	TP_fast_assign(
@@ -455,17 +456,19 @@ TRACE_EVENT(btrfs_delayed_tree_ref,
 		__entry->ref_root	= full_ref->root;
 		__entry->level		= full_ref->level;
 		__entry->type		= ref->type;
+		__entry->seq		= ref->seq;
 	),
 
 	TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
 		  "parent = %llu(%s), ref_root = %llu(%s), level = %d, "
-		  "type = %s",
+		  "type = %s, seq = %llu",
 		  (unsigned long long)__entry->bytenr,
 		  (unsigned long long)__entry->num_bytes,
 		  show_ref_action(__entry->action),
 		  show_root_type(__entry->parent),
 		  show_root_type(__entry->ref_root),
-		  __entry->level, show_ref_type(__entry->type))
+		  __entry->level, show_ref_type(__entry->type),
+		  (unsigned long long)__entry->seq)
 );
 
 TRACE_EVENT(btrfs_delayed_data_ref,
@@ -485,6 +488,7 @@ TRACE_EVENT(btrfs_delayed_data_ref,
 		__field(	u64,  owner		)
 		__field(	u64,  offset		)
 		__field(	int,  type		)
+		__field(	u64,  seq		)
 	),
 
 	TP_fast_assign(
@@ -496,11 +500,12 @@ TRACE_EVENT(btrfs_delayed_data_ref,
 		__entry->owner		= full_ref->objectid;
 		__entry->offset		= full_ref->offset;
 		__entry->type		= ref->type;
+		__entry->seq		= ref->seq;
 	),
 
 	TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
 		  "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, "
-		  "offset = %llu, type = %s",
+		  "offset = %llu, type = %s, seq = %llu",
 		  (unsigned long long)__entry->bytenr,
 		  (unsigned long long)__entry->num_bytes,
 		  show_ref_action(__entry->action),
@@ -508,7 +513,8 @@ TRACE_EVENT(btrfs_delayed_data_ref,
 		  show_root_type(__entry->ref_root),
 		  (unsigned long long)__entry->owner,
 		  (unsigned long long)__entry->offset,
-		  show_ref_type(__entry->type))
+		  show_ref_type(__entry->type),
+		  (unsigned long long)__entry->seq)
 );
 
 TRACE_EVENT(btrfs_delayed_ref_head,
-- 
cgit v0.10.2


From 69917e431210f8712fe050f47b7561e7dae89521 Mon Sep 17 00:00:00 2001
From: Liu Bo <liub.liubo@gmail.com>
Date: Fri, 7 Sep 2012 20:01:28 -0600
Subject: Btrfs: fix a bug in parsing return value in logical resolve

In logical resolve, we parse extent_from_logical()'s 'ret' as a kind of flag.

It is possible to lose our errors because
(-EXXXX & BTRFS_EXTENT_FLAG_TREE_BLOCK) is true.

I'm not sure if it is on purpose, it just looks too hacky if it is.
I'd rather use a real flag and a 'ret' to catch errors.

Acked-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Liu Bo <liub.liubo@gmail.com>

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e600857..ab286e5 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1193,7 +1193,8 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
  * tree blocks and <0 on error.
  */
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key)
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags_ret)
 {
 	int ret;
 	u64 flags;
@@ -1237,10 +1238,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 		 (unsigned long long)found_key->objectid,
 		 (unsigned long long)found_key->offset,
 		 (unsigned long long)flags, item_size);
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-		return BTRFS_EXTENT_FLAG_TREE_BLOCK;
-	if (flags & BTRFS_EXTENT_FLAG_DATA)
-		return BTRFS_EXTENT_FLAG_DATA;
+
+	WARN_ON(!flags_ret);
+	if (flags_ret) {
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+		else if (flags & BTRFS_EXTENT_FLAG_DATA)
+			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
+		else
+			BUG_ON(1);
+		return 0;
+	}
 
 	return -EIO;
 }
@@ -1433,15 +1441,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 {
 	int ret;
 	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_key found_key;
 	int search_commit_root = path->search_commit_root;
 
-	ret = extent_from_logical(fs_info, logical, path,
-					&found_key);
+	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
 	btrfs_release_path(path);
 	if (ret < 0)
 		return ret;
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		return -EINVAL;
 
 	extent_item_pos = logical - found_key.objectid;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 032f4dc..4fda5d8 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -40,7 +40,8 @@ int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
 			struct btrfs_path *path);
 
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key);
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags);
 
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 				struct btrfs_extent_item *ei, u32 item_size,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index de886ac..48ff3f7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3211,6 +3211,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 	int ret = 0;
 	int size;
 	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_ioctl_logical_ino_args *loi;
 	struct btrfs_data_container *inodes = NULL;
 	struct btrfs_path *path = NULL;
@@ -3240,10 +3241,11 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 		goto out;
 	}
 
-	ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
+	ret = extent_from_logical(root->fs_info, loi->logical, path, &key,
+				  &flags);
 	btrfs_release_path(path);
 
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		ret = -ENOENT;
 	if (ret < 0)
 		goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4e9eafe..d3bb901 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -352,13 +352,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	struct extent_buffer *eb;
 	struct btrfs_extent_item *ei;
 	struct scrub_warning swarn;
-	u32 item_size;
-	int ret;
+	unsigned long ptr = 0;
+	u64 extent_item_pos;
+	u64 flags = 0;
 	u64 ref_root;
+	u32 item_size;
 	u8 ref_level;
-	unsigned long ptr = 0;
 	const int bufsize = 4096;
-	u64 extent_item_pos;
+	int ret;
 
 	path = btrfs_alloc_path();
 
@@ -375,7 +376,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
 		goto out;
 
-	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
+	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
+				  &flags);
 	if (ret < 0)
 		goto out;
 
@@ -387,7 +389,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
 	btrfs_release_path(path);
 
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		do {
 			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
 							&ref_root, &ref_level);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e5c8679..c6ef070 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1152,6 +1152,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	u64 disk_byte;
 	u64 num_bytes;
 	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *eb = path->nodes[0];
 	struct backref_ctx *backref_ctx = NULL;
@@ -1198,13 +1199,13 @@ static int find_extent_clone(struct send_ctx *sctx,
 	}
 	logical = disk_byte + btrfs_file_extent_offset(eb, fi);
 
-	ret = extent_from_logical(sctx->send_root->fs_info,
-			disk_byte, tmp_path, &found_key);
+	ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
+				  &found_key, &flags);
 	btrfs_release_path(tmp_path);
 
 	if (ret < 0)
 		goto out;
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		ret = -EIO;
 		goto out;
 	}
-- 
cgit v0.10.2


From df031f0752d50f2061df2847d57ea52a79f7977c Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 7 Sep 2012 20:01:29 -0600
Subject: Btrfs: use helper for logical resolve

We already have a helper, iterate_inodes_from_logical(), for logical resolve,
so just use it.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 48ff3f7..fa1284d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3210,12 +3210,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 {
 	int ret = 0;
 	int size;
-	u64 extent_item_pos;
-	u64 flags = 0;
 	struct btrfs_ioctl_logical_ino_args *loi;
 	struct btrfs_data_container *inodes = NULL;
 	struct btrfs_path *path = NULL;
-	struct btrfs_key key;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -3241,23 +3238,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 		goto out;
 	}
 
-	ret = extent_from_logical(root->fs_info, loi->logical, path, &key,
-				  &flags);
-	btrfs_release_path(path);
-
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+	ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
+					  build_ino_list, inodes);
+	if (ret == -EINVAL)
 		ret = -ENOENT;
 	if (ret < 0)
 		goto out;
 
-	extent_item_pos = loi->logical - key.objectid;
-	ret = iterate_extent_inodes(root->fs_info, key.objectid,
-					extent_item_pos, 0, build_ino_list,
-					inodes);
-
-	if (ret < 0)
-		goto out;
-
 	ret = copy_to_user((void *)(unsigned long)loi->inodes,
 			   (void *)(unsigned long)inodes, size);
 	if (ret)
-- 
cgit v0.10.2


From 425d17a290c0c63785ec65db154a95c6337aeefa Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 7 Sep 2012 20:01:30 -0600
Subject: Btrfs: use larger limit for translation of logical to inode

This is the change of the kernel side.

Translation of logical to inode used to have an upper limit 4k on
inode container's size, but the limit is not large enough for a data
with a great many of refs, so when resolving logical address,
we can end up with
"ioctl ret=0, bytes_left=0, bytes_missing=19944, cnt=510, missed=2493"

This changes to regard 64k as the upper limit and use vmalloc instead of
kmalloc to get memory more easily.

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ab286e5..7084431 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
@@ -1584,7 +1585,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
 	size_t alloc_bytes;
 
 	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
-	data = kmalloc(alloc_bytes, GFP_NOFS);
+	data = vmalloc(alloc_bytes);
 	if (!data)
 		return ERR_PTR(-ENOMEM);
 
@@ -1635,6 +1636,6 @@ void free_ipath(struct inode_fs_paths *ipath)
 {
 	if (!ipath)
 		return;
-	kfree(ipath->fspath);
+	vfree(ipath->fspath);
 	kfree(ipath);
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fa1284d..38e3d6a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3230,7 +3230,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 		goto out;
 	}
 
-	size = min_t(u32, loi->size, 4096);
+	size = min_t(u32, loi->size, 64 * 1024);
 	inodes = init_data_container(size);
 	if (IS_ERR(inodes)) {
 		ret = PTR_ERR(inodes);
@@ -3252,7 +3252,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 
 out:
 	btrfs_free_path(path);
-	kfree(inodes);
+	vfree(inodes);
 	kfree(loi);
 
 	return ret;
-- 
cgit v0.10.2


From 6df7881a84013f91405e5e113a4c322dd1804ba6 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 5 Sep 2012 08:08:30 -0600
Subject: Btrfs: move the sb_end_intwrite until after the throttle logic

Sage reported the following lockdep backtrace

=====================================
[ BUG: bad unlock balance detected! ]
3.6.0-rc2-ceph-00171-gc7ed62d #1 Not tainted
-------------------------------------
btrfs-cleaner/7607 is trying to release lock (sb_internal) at:
[<ffffffffa00422ae>] btrfs_commit_transaction+0xa6e/0xb20 [btrfs]
but there are no more locks to release!

other info that might help us debug this:
1 lock held by btrfs-cleaner/7607:
 #0:  (&fs_info->cleaner_mutex){+.+...}, at: [<ffffffffa003b405>] cleaner_kthread+0x95/0x120 [btrfs]

stack backtrace:
Pid: 7607, comm: btrfs-cleaner Not tainted 3.6.0-rc2-ceph-00171-gc7ed62d #1
Call Trace:
 [<ffffffffa00422ae>] ? btrfs_commit_transaction+0xa6e/0xb20 [btrfs]
 [<ffffffff810afa9e>] print_unlock_inbalance_bug+0xfe/0x110
 [<ffffffff810b289e>] lock_release_non_nested+0x1ee/0x310
 [<ffffffff81172f9b>] ? kmem_cache_free+0x7b/0x160
 [<ffffffffa004106c>] ? put_transaction+0x8c/0x130 [btrfs]
 [<ffffffffa00422ae>] ? btrfs_commit_transaction+0xa6e/0xb20 [btrfs]
 [<ffffffff810b2a95>] lock_release+0xd5/0x220
 [<ffffffff81173071>] ? kmem_cache_free+0x151/0x160
 [<ffffffff8117d9ed>] __sb_end_write+0x7d/0x90
 [<ffffffffa00422ae>] btrfs_commit_transaction+0xa6e/0xb20 [btrfs]
 [<ffffffff81079850>] ? __init_waitqueue_head+0x60/0x60
 [<ffffffff81634c6b>] ? _raw_spin_unlock+0x2b/0x40
 [<ffffffffa0042758>] __btrfs_end_transaction+0x368/0x3c0 [btrfs]
 [<ffffffffa0042808>] btrfs_end_transaction_throttle+0x18/0x20 [btrfs]
 [<ffffffffa00318f0>] btrfs_drop_snapshot+0x410/0x600 [btrfs]
 [<ffffffff8132babd>] ? do_raw_spin_unlock+0x5d/0xb0
 [<ffffffffa00430ef>] btrfs_clean_old_snapshots+0xaf/0x150 [btrfs]
 [<ffffffffa003b405>] ? cleaner_kthread+0x95/0x120 [btrfs]
 [<ffffffffa003b419>] cleaner_kthread+0xa9/0x120 [btrfs]
 [<ffffffffa003b370>] ? btrfs_destroy_delayed_refs.isra.102+0x220/0x220 [btrfs]
 [<ffffffff810791ee>] kthread+0xae/0xc0
 [<ffffffff810b379d>] ? trace_hardirqs_on+0xd/0x10
 [<ffffffff8163e744>] kernel_thread_helper+0x4/0x10
 [<ffffffff81635430>] ? retint_restore_args+0x13/0x13
 [<ffffffff81079140>] ? flush_kthread_work+0x1a0/0x1a0
 [<ffffffff8163e740>] ? gs_change+0x13/0x13

This is because the throttle stuff can commit the transaction, which expects to
be the one stopping the intwrite stuff, but we've already done it in the
__btrfs_end_transaction.  Moving the sb_end_intewrite after this logic makes the
lockdep go away.  Thanks,

Tested-by: Sage Weil <sage@inktank.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f8ae448..0629edf 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -564,8 +564,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
 
-	sb_end_intwrite(root->fs_info->sb);
-
 	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
 	    should_end_transaction(trans, root)) {
 		trans->transaction->blocked = 1;
@@ -586,6 +584,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		}
 	}
 
+	sb_end_intwrite(root->fs_info->sb);
+
 	WARN_ON(cur_trans != info->running_transaction);
 	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
 	atomic_dec(&cur_trans->num_writers);
-- 
cgit v0.10.2


From 69ffb54347a71c4f2a4694e0c1155af8538d55f6 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 11 Sep 2012 15:40:07 -0400
Subject: Btrfs: create a pinned em when writing to a prealloc range in DIO

Wade Cline reported a problem where he was getting garbage and warnings when
writing to a preallocated range via O_DIRECT.  This is because we weren't
creating our normal pinned extent_map for the range we were writing to,
which was causing all sorts of issues.  This patch fixes the problem and
makes his testcase much happier.  Thanks,

Reported-by: Wade Cline <clinew@linux.vnet.ibm.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9e9754a..406666c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5898,6 +5898,48 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 	return ret;
 }
 
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+					   u64 len, u64 orig_start,
+					   u64 block_start, u64 block_len,
+					   int type)
+{
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	em_tree = &BTRFS_I(inode)->extent_tree;
+	em = alloc_extent_map();
+	if (!em)
+		return ERR_PTR(-ENOMEM);
+
+	em->start = start;
+	em->orig_start = orig_start;
+	em->len = len;
+	em->block_len = block_len;
+	em->block_start = block_start;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+	if (type == BTRFS_ORDERED_PREALLOC)
+		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+
+	do {
+		btrfs_drop_extent_cache(inode, em->start,
+				em->start + em->len - 1, 0);
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+	} while (ret == -EEXIST);
+
+	if (ret) {
+		free_extent_map(em);
+		return ERR_PTR(ret);
+	}
+
+	return em;
+}
+
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
@@ -6012,6 +6054,19 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 			goto must_cow;
 
 		if (can_nocow_odirect(trans, inode, start, len) == 1) {
+			u64 orig_start = em->start;
+
+			if (type == BTRFS_ORDERED_PREALLOC) {
+				free_extent_map(em);
+				em = create_pinned_em(inode, start, len,
+						       orig_start,
+						       block_start, len, type);
+				if (IS_ERR(em)) {
+					btrfs_end_transaction(trans, root);
+					goto unlock_err;
+				}
+			}
+
 			ret = btrfs_add_ordered_extent_dio(inode, start,
 					   block_start, len, len, type);
 			btrfs_end_transaction(trans, root);
-- 
cgit v0.10.2


From 962197babeccc1f4cc8aa28ad844df80bdc85ed0 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Tue, 11 Sep 2012 23:27:35 -0600
Subject: Btrfs: fix unnecessary warning when the fragments make the space
 alloc fail

When we wrote some data by compress mode into a btrfs filesystem which was full
of the fragments, the kernel will report:
	BTRFS warning (device xxx): Aborting unused transaction.

The reason is:
We can not find a long enough free space to store the compressed data because
of the fragmentary free space, and the compressed data can not be splited,
so the kernel outputed the above message.

In fact, btrfs can deal with this problem very well: it fall back to
uncompressed IO, split the uncompressed data into small ones, and then
store them into to the fragmentary free space. So we shouldn't output the
above warning message.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 406666c..d41e467 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -662,7 +662,7 @@ retry:
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
 					   0, alloc_hint, &ins, 1);
-			if (ret)
+			if (ret && ret != -ENOSPC)
 				btrfs_abort_transaction(trans, root, ret);
 			btrfs_end_transaction(trans, root);
 		}
-- 
cgit v0.10.2


From be3940c0a90265654d778394cafe2e2cec674df8 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <koverstreet@google.com>
Date: Tue, 11 Sep 2012 14:23:05 -0600
Subject: btrfs: Kill some bi_idx references

For immutable bio vecs, I've been auditing and removing bi_idx
references. These were harmless, but removing them will make auditing
easier.

scrub_bio_end_io_worker() was open coding a bio_reset() - but this
doesn't appear to have been needed for anything as right after it does a
bio_put(), and perusing the code it doesn't appear anything else was
holding a reference to the bio.

The other use end_bio_extent_readpage() was just for a pr_debug() -
changed it to something that might be a bit more useful.

Signed-off-by: Kent Overstreet <koverstreet@google.com>
CC: Chris Mason <chris.mason@oracle.com>
CC: Stefan Behrens <sbehrens@giantdisaster.de>

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3ad84f5..90bd9f7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2306,8 +2306,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct extent_state *cached = NULL;
 		struct extent_state *state;
 
-		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
-			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
+			 "mirror=%ld\n", (u64)bio->bi_sector, err,
 			 (long int)bio->bi_bdev);
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d3bb901..27892f6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1669,21 +1669,6 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 		scrub_block_put(sblock);
 	}
 
-	if (sbio->err) {
-		/* what is this good for??? */
-		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
-		sbio->bio->bi_phys_segments = 0;
-		sbio->bio->bi_idx = 0;
-
-		for (i = 0; i < sbio->page_count; i++) {
-			struct bio_vec *bi;
-			bi = &sbio->bio->bi_io_vec[i];
-			bi->bv_offset = 0;
-			bi->bv_len = PAGE_SIZE;
-		}
-	}
-
 	bio_put(sbio->bio);
 	sbio->bio = NULL;
 	spin_lock(&sdev->list_lock);
-- 
cgit v0.10.2


From ea658badc47e614e38ab4d98510488474c7e6d4b Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 11 Sep 2012 16:57:25 -0400
Subject: Btrfs: delay block group item insertion

So we have lots of places where we try to preallocate chunks in order to
make sure we have enough space as we make our allocations.  This has
historically meant that we're constantly tweaking when we should allocate a
new chunk, and historically we have gotten this horribly wrong so we way
over allocate either metadata or data.  To try and keep this from happening
we are going to make it so that the block group item insertion is done out
of band at the end of a transaction.  This will allow us to create chunks
even if we are trying to make an allocation for the extent tree.  With this
patch my enospc tests run faster (didn't expect this) and more efficiently
use the disk space (this is what I wanted).  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 305002b..d66dc1c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1137,6 +1137,9 @@ struct btrfs_block_group_cache {
 	 * Today it will only have one thing on it, but that may change
 	 */
 	struct list_head cluster_list;
+
+	/* For delayed block group creation */
+	struct list_head new_bg_list;
 };
 
 /* delayed seq elem */
@@ -2865,6 +2868,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a8de1c3..254fd32 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2361,10 +2361,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		}
 
 next:
-		do_chunk_alloc(trans, fs_info->extent_root,
-			       2 * 1024 * 1024,
-			       btrfs_get_alloc_profile(root, 0),
-			       CHUNK_ALLOC_NO_FORCE);
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
@@ -2478,10 +2474,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	if (root == root->fs_info->extent_root)
 		root = root->fs_info->tree_root;
 
-	do_chunk_alloc(trans, root->fs_info->extent_root,
-		       2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
-		       CHUNK_ALLOC_NO_FORCE);
-
 	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -2551,6 +2543,12 @@ again:
 	}
 
 	if (run_all) {
+		if (!list_empty(&trans->new_bgs)) {
+			spin_unlock(&delayed_refs->lock);
+			btrfs_create_pending_block_groups(trans, root);
+			spin_lock(&delayed_refs->lock);
+		}
+
 		node = rb_first(&delayed_refs->root);
 		if (!node)
 			goto out;
@@ -3826,7 +3824,8 @@ enum flush_state {
 	FLUSH_DELALLOC_WAIT	=	2,
 	FLUSH_DELAYED_ITEMS_NR	=	3,
 	FLUSH_DELAYED_ITEMS	=	4,
-	COMMIT_TRANS		=	5,
+	ALLOC_CHUNK		=	5,
+	COMMIT_TRANS		=	6,
 };
 
 static int flush_space(struct btrfs_root *root,
@@ -3863,6 +3862,20 @@ static int flush_space(struct btrfs_root *root,
 		ret = btrfs_run_delayed_items_nr(trans, root, nr);
 		btrfs_end_transaction(trans, root);
 		break;
+	case ALLOC_CHUNK:
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+				     num_bytes,
+				     btrfs_get_alloc_profile(root, 0),
+				     CHUNK_ALLOC_NO_FORCE);
+		btrfs_end_transaction(trans, root);
+		if (ret == -ENOSPC)
+			ret = 0;
+		break;
 	case COMMIT_TRANS:
 		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
 		break;
@@ -5515,8 +5528,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *used_block_group;
 	u64 search_start = 0;
 	int empty_cluster = 2 * 1024 * 1024;
-	int allowed_chunk_alloc = 0;
-	int done_chunk_alloc = 0;
 	struct btrfs_space_info *space_info;
 	int loop = 0;
 	int index = 0;
@@ -5548,9 +5559,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	if (btrfs_mixed_space_info(space_info))
 		use_cluster = false;
 
-	if (orig_root->ref_cows || empty_size)
-		allowed_chunk_alloc = 1;
-
 	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
 		last_ptr = &root->fs_info->meta_alloc_cluster;
 		if (!btrfs_test_opt(root, SSD))
@@ -5860,34 +5868,18 @@ loop:
 		index = 0;
 		loop++;
 		if (loop == LOOP_ALLOC_CHUNK) {
-		       if (allowed_chunk_alloc) {
-				ret = do_chunk_alloc(trans, root, num_bytes +
-						     2 * 1024 * 1024, data,
-						     CHUNK_ALLOC_LIMITED);
-				/*
-				 * Do not bail out on ENOSPC since we
-				 * can do more things.
-				 */
-				if (ret < 0 && ret != -ENOSPC) {
-					btrfs_abort_transaction(trans,
-								root, ret);
-					goto out;
-				}
-				allowed_chunk_alloc = 0;
-				if (ret == 1)
-					done_chunk_alloc = 1;
-			} else if (!done_chunk_alloc &&
-				   space_info->force_alloc ==
-				   CHUNK_ALLOC_NO_FORCE) {
-				space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+			ret = do_chunk_alloc(trans, root, num_bytes +
+					     2 * 1024 * 1024, data,
+					     CHUNK_ALLOC_FORCE);
+			/*
+			 * Do not bail out on ENOSPC since we
+			 * can do more things.
+			 */
+			if (ret < 0 && ret != -ENOSPC) {
+				btrfs_abort_transaction(trans,
+							root, ret);
+				goto out;
 			}
-
-		       /*
-			* We didn't allocate a chunk, go ahead and drop the
-			* empty size and loop again.
-			*/
-		       if (!done_chunk_alloc)
-			       loop = LOOP_NO_EMPTY_SIZE;
 		}
 
 		if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5962,20 +5954,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 
 	data = btrfs_get_alloc_profile(root, data);
 again:
-	/*
-	 * the only place that sets empty_size is btrfs_realloc_node, which
-	 * is not called recursively on allocations
-	 */
-	if (empty_size || root->ref_cows) {
-		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes + 2 * 1024 * 1024, data,
-				     CHUNK_ALLOC_NO_FORCE);
-		if (ret < 0 && ret != -ENOSPC) {
-			btrfs_abort_transaction(trans, root, ret);
-			return ret;
-		}
-	}
-
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
 			       hint_byte, ins, data);
@@ -5985,12 +5963,6 @@ again:
 			num_bytes = num_bytes >> 1;
 			num_bytes = num_bytes & ~(root->sectorsize - 1);
 			num_bytes = max(num_bytes, min_alloc_size);
-			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				       num_bytes, data, CHUNK_ALLOC_FORCE);
-			if (ret < 0 && ret != -ENOSPC) {
-				btrfs_abort_transaction(trans, root, ret);
-				return ret;
-			}
 			if (num_bytes == min_alloc_size)
 				final_tried = true;
 			goto again;
@@ -7828,6 +7800,34 @@ error:
 	return ret;
 }
 
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *block_group, *tmp;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_block_group_item item;
+	struct btrfs_key key;
+	int ret = 0;
+
+	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
+				 new_bg_list) {
+		list_del_init(&block_group->new_bg_list);
+
+		if (ret)
+			continue;
+
+		spin_lock(&block_group->lock);
+		memcpy(&item, &block_group->item, sizeof(item));
+		memcpy(&key, &block_group->key, sizeof(key));
+		spin_unlock(&block_group->lock);
+
+		ret = btrfs_insert_item(trans, extent_root, &key, &item,
+					sizeof(item));
+		if (ret)
+			btrfs_abort_transaction(trans, extent_root, ret);
+	}
+}
+
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 bytes_used,
 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7861,6 +7861,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	spin_lock_init(&cache->lock);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
+	INIT_LIST_HEAD(&cache->new_bg_list);
 
 	btrfs_init_free_space_ctl(cache);
 
@@ -7892,12 +7893,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
 	BUG_ON(ret); /* Logic error */
 
-	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
-				sizeof(cache->item));
-	if (ret) {
-		btrfs_abort_transaction(trans, extent_root, ret);
-		return ret;
-	}
+	list_add_tail(&cache->new_bg_list, &trans->new_bgs);
 
 	set_avail_alloc_bits(extent_root->fs_info, type);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0629edf..c01dec7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -374,6 +374,7 @@ again:
 	h->qgroup_reserved = qgroup_reserved;
 	h->delayed_ref_elem.seq = 0;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
+	INIT_LIST_HEAD(&h->new_bgs);
 
 	smp_mb();
 	if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -549,6 +550,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		trans->qgroup_reserved = 0;
 	}
 
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
+
 	while (count < 2) {
 		unsigned long cur = trans->delayed_ref_updates;
 		trans->delayed_ref_updates = 0;
@@ -564,6 +568,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
 
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
+
 	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
 	    should_end_transaction(trans, root)) {
 		trans->transaction->blocked = 1;
@@ -1400,6 +1407,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 */
 	cur_trans->delayed_refs.flushing = 1;
 
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
+
 	ret = btrfs_run_delayed_refs(trans, root, 0);
 	if (ret)
 		goto cleanup_transaction;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 1b05480..b6463e1 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -68,6 +68,7 @@ struct btrfs_trans_handle {
 	struct btrfs_root *root;
 	struct seq_list delayed_ref_elem;
 	struct list_head qgroup_ref_list;
+	struct list_head new_bgs;
 };
 
 struct btrfs_pending_snapshot {
-- 
cgit v0.10.2


From 698d0082c4875a2ccc10b52ee8f415faad46b754 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 12 Sep 2012 14:08:47 -0400
Subject: Btrfs: remove bytes argument from do_chunk_alloc

Everybody is just making stuff up, and it's just used to see if we really do
need to alloc a chunk, and since we do this when we already know we really
do it's just a waste of space.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 254fd32..0bc83b3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -94,8 +94,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 				     u64 flags, struct btrfs_disk_key *key,
 				     int level, struct btrfs_key *ins);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *extent_root, u64 alloc_bytes,
-			  u64 flags, int force);
+			  struct btrfs_root *extent_root, u64 flags,
+			  int force);
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -3404,7 +3404,6 @@ alloc:
 				return PTR_ERR(trans);
 
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-					     bytes + 2 * 1024 * 1024,
 					     alloc_target,
 					     CHUNK_ALLOC_NO_FORCE);
 			btrfs_end_transaction(trans, root);
@@ -3486,8 +3485,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 }
 
 static int should_alloc_chunk(struct btrfs_root *root,
-			      struct btrfs_space_info *sinfo, u64 alloc_bytes,
-			      int force)
+			      struct btrfs_space_info *sinfo, int force)
 {
 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3518,7 +3516,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 			return 1;
 	}
 
-	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
+	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
 		return 0;
 	return 1;
 }
@@ -3568,8 +3566,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
 }
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *extent_root, u64 alloc_bytes,
-			  u64 flags, int force)
+			  struct btrfs_root *extent_root, u64 flags, int force)
 {
 	struct btrfs_space_info *space_info;
 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3593,7 +3590,7 @@ again:
 		return 0;
 	}
 
-	if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
+	if (!should_alloc_chunk(extent_root, space_info, force)) {
 		spin_unlock(&space_info->lock);
 		return 0;
 	} else if (space_info->chunk_alloc) {
@@ -3869,7 +3866,6 @@ static int flush_space(struct btrfs_root *root,
 			break;
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes,
 				     btrfs_get_alloc_profile(root, 0),
 				     CHUNK_ALLOC_NO_FORCE);
 		btrfs_end_transaction(trans, root);
@@ -5868,8 +5864,7 @@ loop:
 		index = 0;
 		loop++;
 		if (loop == LOOP_ALLOC_CHUNK) {
-			ret = do_chunk_alloc(trans, root, num_bytes +
-					     2 * 1024 * 1024, data,
+			ret = do_chunk_alloc(trans, root, data,
 					     CHUNK_ALLOC_FORCE);
 			/*
 			 * Do not bail out on ENOSPC since we
@@ -7269,7 +7264,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
 	alloc_flags = update_block_group_flags(root, cache->flags);
 	if (alloc_flags != cache->flags) {
-		ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+		ret = do_chunk_alloc(trans, root, alloc_flags,
 				     CHUNK_ALLOC_FORCE);
 		if (ret < 0)
 			goto out;
@@ -7279,7 +7274,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 	if (!ret)
 		goto out;
 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
-	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+	ret = do_chunk_alloc(trans, root, alloc_flags,
 			     CHUNK_ALLOC_FORCE);
 	if (ret < 0)
 		goto out;
@@ -7293,7 +7288,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 type)
 {
 	u64 alloc_flags = get_alloc_profile(root, type);
-	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+	return do_chunk_alloc(trans, root, alloc_flags,
 			      CHUNK_ALLOC_FORCE);
 }
 
-- 
cgit v0.10.2


From 90abccf2c6e6e9c5a5d519eaed95292afa30aa11 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 13 Sep 2012 04:53:47 -0600
Subject: Revert "Btrfs: do not do filemap_write_and_wait_range in fsync"

This reverts commit 0885ef5b5601e9b007c383e77c172769b1f214fd

After applying the above patch, the performance slowed down because the dirty
page flush can only be done by one task, so revert it.

The following is the test result of sysbench:
	Before		After
	24MB/s		39MB/s

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0a4b03d..d0fc4c5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1544,12 +1544,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_btrfs_sync_file(file, datasync);
 
+	/*
+	 * We write the dirty pages in the range and wait until they complete
+	 * out of the ->i_mutex. If so, we can flush the dirty pages by
+	 * multi-task, and make the performance up.
+	 */
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
 	mutex_lock(&inode->i_mutex);
 
 	/*
-	 * we wait first, since the writeback may change the inode, also wait
-	 * ordered range does a filemape_write_and_wait_range which is why we
-	 * don't do it above like other file systems.
+	 * We flush the dirty pages again to avoid some dirty pages in the
+	 * range being left.
 	 */
 	atomic_inc(&root->log_batch);
 	btrfs_wait_ordered_range(inode, start, end);
-- 
cgit v0.10.2


From 0433f20d436fd040013e5a419a2e3d732d618a41 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 13 Sep 2012 03:32:32 -0600
Subject: Btrfs: cleanup of error processing in btree_get_extent()

This patch simplifies a little complex error processing in
btree_get_extent().

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8054f7c..8db87bc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -222,21 +222,17 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
-		if (em) {
-			ret = 0;
-		} else {
-			em = lookup_extent_mapping(em_tree, failed_start,
-						   failed_len);
-			ret = -EIO;
+		if (!em) {
+			lookup_extent_mapping(em_tree, failed_start,
+					      failed_len);
+			em = ERR_PTR(-EIO);
 		}
 	} else if (ret) {
 		free_extent_map(em);
-		em = NULL;
+		em = ERR_PTR(ret);
 	}
 	write_unlock(&em_tree->lock);
 
-	if (ret)
-		em = ERR_PTR(ret);
 out:
 	return em;
 }
-- 
cgit v0.10.2


From b4f359ab065b9c6c132124557d23e90e2e8504be Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 13 Sep 2012 03:32:54 -0600
Subject: Btrfs: remove unnecessary code in btree_get_extent()

Unnecessary lookup_extent_mapping() is removed because an error is
returned to the caller.
This patch was made based on the advice from Stefan Behrens, thanks.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8db87bc..8d633e3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -217,16 +217,10 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	if (ret == -EEXIST) {
-		u64 failed_start = em->start;
-		u64 failed_len = em->len;
-
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
-		if (!em) {
-			lookup_extent_mapping(em_tree, failed_start,
-					      failed_len);
+		if (!em)
 			em = ERR_PTR(-EIO);
-		}
 	} else if (ret) {
 		free_extent_map(em);
 		em = ERR_PTR(ret);
-- 
cgit v0.10.2


From 1bcea35597693b3ac1ec1b311cfd42d52972a710 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Sep 2012 00:04:21 -0600
Subject: Btrfs: write_buf is now callable outside send.c

Developing service cmds needs it.

Signed-off-by: Anand Jain <anand.jain@oracle.com>

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index c6ef070..c7beb54 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -386,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void)
 	return path;
 }
 
-static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
+int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
 {
 	int ret;
 	mm_segment_t old_fs;
@@ -396,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
 	set_fs(KERNEL_DS);
 
 	while (pos < len) {
-		ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos,
-				&sctx->send_off);
+		ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
 		/* TODO handle that correctly */
 		/*if (ret == -ERESTARTSYS) {
 			continue;
@@ -553,7 +552,8 @@ static int send_header(struct send_ctx *sctx)
 	strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
 	hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
 
-	return write_buf(sctx, &hdr, sizeof(hdr));
+	return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
+					&sctx->send_off);
 }
 
 /*
@@ -590,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx)
 	crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
 	hdr->crc = cpu_to_le32(crc);
 
-	ret = write_buf(sctx, sctx->send_buf, sctx->send_size);
+	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+					&sctx->send_off);
 
 	sctx->total_send_size += sctx->send_size;
 	sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9934e94..1bf4f32 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -130,4 +130,5 @@ enum {
 
 #ifdef __KERNEL__
 long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
+int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);
 #endif
-- 
cgit v0.10.2


From ebb3dad4353b94c12a5cffab4397727c19f088e5 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Thu, 13 Sep 2012 20:29:02 -0600
Subject: Btrfs: using for_each_set_bit_from to simplify the code

Using for_each_set_bit_from() to simplify the code.

spatch with a semantic match is used to found this.
(http://coccinelle.lip6.fr/)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6b10acf..b107e68 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1454,9 +1454,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 			  max_t(u64, *offset, bitmap_info->offset));
 	bits = bytes_to_bits(*bytes, ctl->unit);
 
-	for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
-	     i < BITS_PER_BITMAP;
-	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
 		next_zero = find_next_zero_bit(bitmap_info->bitmap,
 					       BITS_PER_BITMAP, i);
 		if ((next_zero - i) >= bits) {
@@ -2307,9 +2305,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
 
 again:
 	found_bits = 0;
-	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
-	     i < BITS_PER_BITMAP;
-	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
 		next_zero = find_next_zero_bit(entry->bitmap,
 					       BITS_PER_BITMAP, i);
 		if (next_zero - i >= min_bits) {
-- 
cgit v0.10.2


From b3ae244e7174d981c09ad7a6a68e7909d600aaca Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Thu, 13 Sep 2012 16:04:34 -0600
Subject: btrfs: return EPERM upon rmdir on a subvolume

A subvolume cannot be deleted via rmdir, but the error code ENOTEMPTY
is confusing. Return EPERM instead, as this is not permitted.

Signed-off-by: David Sterba <dsterba@suse.cz>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d41e467..1c50f7c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3200,9 +3200,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	unsigned long nr = 0;
 
-	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
-	    btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
+	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+		return -EPERM;
 
 	trans = __unlink_start_trans(dir, dentry);
 	if (IS_ERR(trans))
-- 
cgit v0.10.2


From 60376ce4a8396bc5cd777be05b6a9bf044520f42 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 10:34:40 -0400
Subject: Btrfs: fix race in sync and freeze again

I screwed this up, there is a race between checking if there is a running
transaction and actually starting a transaction in sync where we could race
with a freezer and get ourselves into trouble.  To fix this we need to make
a new join type to only do the try lock on the freeze stuff.  If it fails
we'll return EPERM and just return from sync.  This fixes a hang Liu Bo
reported when running xfstest 68 in a loop.  Thanks,

Reported-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 867e8e7..903ab2d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -854,16 +854,13 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
 	btrfs_wait_ordered_extents(root, 0, 0);
 
-	spin_lock(&fs_info->trans_lock);
-	if (!fs_info->running_transaction) {
-		spin_unlock(&fs_info->trans_lock);
-		return 0;
-	}
-	spin_unlock(&fs_info->trans_lock);
-
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans))
+	trans = btrfs_join_transaction_freeze(root);
+	if (IS_ERR(trans)) {
+		/* Frozen, don't bother */
+		if (PTR_ERR(trans) == -EPERM)
+			return 0;
 		return PTR_ERR(trans);
+	}
 	return btrfs_commit_transaction(trans, root);
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c01dec7..e4bfac8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -272,6 +272,7 @@ enum btrfs_trans_type {
 	TRANS_JOIN,
 	TRANS_USERSPACE,
 	TRANS_JOIN_NOLOCK,
+	TRANS_JOIN_FREEZE,
 };
 
 static int may_wait_transaction(struct btrfs_root *root, int type)
@@ -341,7 +342,11 @@ again:
 	if (!h)
 		return ERR_PTR(-ENOMEM);
 
-	sb_start_intwrite(root->fs_info->sb);
+	if (!__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) {
+		if (type == TRANS_JOIN_FREEZE)
+			return ERR_PTR(-EPERM);
+		sb_start_intwrite(root->fs_info->sb);
+	}
 
 	if (may_wait_transaction(root, type))
 		wait_current_trans(root);
@@ -424,6 +429,11 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
 	return start_transaction(root, 0, TRANS_USERSPACE, 0);
 }
 
+struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root)
+{
+	return start_transaction(root, 0, TRANS_JOIN_FREEZE, 0);
+}
+
 /* wait for a transaction commit to be fully complete */
 static noinline void wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index b6463e1..fbf8313 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -102,6 +102,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_noflush(
 					struct btrfs_root *root, int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
-- 
cgit v0.10.2


From ab26e9d6c8a760aa5c4518eb2cb1a85516b3a4f7 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 14 Sep 2012 02:58:05 -0600
Subject: Btrfs: cleanup for duplicated code in find_free_extent

There is already an 'add free space' phrase in front of this one, we
needn't to redo it.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0bc83b3..27a6b3e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5828,10 +5828,6 @@ checks:
 
 		trace_btrfs_reserve_extent(orig_root, block_group,
 					   search_start, num_bytes);
-		if (offset < search_start)
-			btrfs_add_free_space(used_block_group, offset,
-					     search_start - offset);
-		BUG_ON(offset > search_start);
 		if (used_block_group != block_group)
 			btrfs_put_block_group(used_block_group);
 		btrfs_put_block_group(block_group);
-- 
cgit v0.10.2


From 2e90cf858fd2639eae2fb3f2173f7633cb15bf38 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 14 Sep 2012 02:58:06 -0600
Subject: Btrfs: cleanup fs_info->hashers

fs_info->hashers is now an obsolete one.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d66dc1c..1372057 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1253,7 +1253,6 @@ struct btrfs_fs_info {
 	struct mutex reloc_mutex;
 
 	struct list_head trans_list;
-	struct list_head hashers;
 	struct list_head dead_roots;
 	struct list_head caching_block_groups;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8d633e3..5d3f813 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1984,7 +1984,6 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->delayed_iputs);
-	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
 	INIT_LIST_HEAD(&fs_info->ordered_operations);
 	INIT_LIST_HEAD(&fs_info->caching_block_groups);
-- 
cgit v0.10.2


From 6bbe3a9c805fcb8cd8d396dafd32078181a7cdd5 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 14 Sep 2012 02:58:07 -0600
Subject: Btrfs: kill obsolete arguments in btrfs_wait_ordered_extents

nocow_only is now an obsolete argument.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 27a6b3e..a3a902f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3722,7 +3722,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	if (delalloc_bytes == 0) {
 		if (trans)
 			return;
-		btrfs_wait_ordered_extents(root, 0, 0);
+		btrfs_wait_ordered_extents(root, 0);
 		return;
 	}
 
@@ -3748,7 +3748,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
 		loops++;
 		if (wait_ordered && !trans) {
-			btrfs_wait_ordered_extents(root, 0, 0);
+			btrfs_wait_ordered_extents(root, 0);
 		} else {
 			time_left = schedule_timeout_killable(1);
 			if (time_left)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e2b3d99..7772f02 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -468,8 +468,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
  * wait for all the ordered extents in a root.  This is done when balancing
  * space between drives.
  */
-void btrfs_wait_ordered_extents(struct btrfs_root *root,
-				int nocow_only, int delay_iput)
+void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 {
 	struct list_head splice;
 	struct list_head *cur;
@@ -484,15 +483,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
 		cur = splice.next;
 		ordered = list_entry(cur, struct btrfs_ordered_extent,
 				     root_extent_list);
-		if (nocow_only &&
-		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
-		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
-			list_move(&ordered->root_extent_list,
-				  &root->fs_info->ordered_extents);
-			cond_resched_lock(&root->fs_info->ordered_extent_lock);
-			continue;
-		}
-
 		list_del_init(&ordered->root_extent_list);
 		atomic_inc(&ordered->refs);
 
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d1ddaef..dd27a0b 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -190,8 +190,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct inode *inode);
-void btrfs_wait_ordered_extents(struct btrfs_root *root,
-				int nocow_only, int delay_iput);
+void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
 int __init ordered_data_init(void);
 void ordered_data_exit(void);
 #endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 7e7fd1b..6e530bb 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4058,7 +4058,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	       (unsigned long long)rc->block_group->flags);
 
 	btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
-	btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
+	btrfs_wait_ordered_extents(fs_info->tree_root, 0);
 
 	while (1) {
 		mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 903ab2d..5aa3b81 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -852,7 +852,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 
-	btrfs_wait_ordered_extents(root, 0, 0);
+	btrfs_wait_ordered_extents(root, 0);
 
 	trans = btrfs_join_transaction_freeze(root);
 	if (IS_ERR(trans)) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e4bfac8..c9265a6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1475,7 +1475,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		if (flush_on_commit || snap_pending) {
 			btrfs_start_delalloc_inodes(root, 1);
-			btrfs_wait_ordered_extents(root, 0, 1);
+			btrfs_wait_ordered_extents(root, 1);
 		}
 
 		ret = btrfs_run_delayed_items(trans, root);
-- 
cgit v0.10.2


From 98114659e0d467e2c0ee6f24f2429329328fc312 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 11:22:38 -0400
Subject: Btrfs: fix race with freeze and free space inodes

So we start our freeze, somebody comes in and does an fsync() on a file
where we have to commit a transaction for whatever reason, and we will
deadlock because the freeze is waiting on FS_FREEZE people to stop writing
to the file system, but the transaction is waiting for its free space inodes
to be written out, which are in turn waiting on sb_start_intwrite while
trying to write the file extents.  To fix this we'll just skip the
sb_start_intwrite() if we TRANS_JOIN_NOLOCK since we're being waited on by a
transaction commit so we're safe wrt to freeze and this will keep us from
deadlocking.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c9265a6..a4fe549 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -342,7 +342,15 @@ again:
 	if (!h)
 		return ERR_PTR(-ENOMEM);
 
-	if (!__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) {
+	/*
+	 * If we are JOIN_NOLOCK we're already committing a transaction and
+	 * waiting on this guy, so we don't need to do the sb_start_intwrite
+	 * because we're already holding a ref.  We need this because we could
+	 * have raced in and did an fsync() on a file which can kick a commit
+	 * and then we deadlock with somebody doing a freeze.
+	 */
+	if (type != TRANS_JOIN_NOLOCK &&
+	    !__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) {
 		if (type == TRANS_JOIN_FREEZE)
 			return ERR_PTR(-EPERM);
 		sb_start_intwrite(root->fs_info->sb);
@@ -601,7 +609,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	sb_end_intwrite(root->fs_info->sb);
+	if (lock)
+		sb_end_intwrite(root->fs_info->sb);
 
 	WARN_ON(cur_trans != info->running_transaction);
 	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
-- 
cgit v0.10.2


From ff44c6e36dc9dcc02652a1105b120bdf08cea9f7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 12:59:20 -0400
Subject: Btrfs: do not hold the write_lock on the extent tree while logging

Dave Sterba pointed out a sleeping while atomic bug while doing fsync.  This
is because I'm an idiot and didn't realize that rwlock's were spin locks, so
we've been holding this thing while doing allocations and such which is not
good.  This patch fixes this by dropping the write lock before we do
anything heavy and re-acquire it when it is done.  We also need to take a
ref on the em's in case their corresponding pages are evicted and mark them
as being logged so that releasepage does not remove them and doesn't remove
them from our local list.  Thanks,

Reported-by: Dave Sterba <dave@jikos.cz>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 8d1364d..b8cbc8d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -407,7 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 
 	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
 	rb_erase(&em->rb_node, &tree->map);
-	list_del_init(&em->list);
+	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+		list_del_init(&em->list);
 	em->in_tree = 0;
 	return ret;
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 8e6294b..6792255 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,6 +13,7 @@
 #define EXTENT_FLAG_COMPRESSED 1
 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
 #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
 
 struct extent_map {
 	struct rb_node rb_node;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 038a522..ed1f7ce 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2945,6 +2945,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		list_del_init(&em->list);
 		if (em->generation <= test_gen)
 			continue;
+		/* Need a ref to keep it from getting evicted from cache */
+		atomic_inc(&em->refs);
+		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
 		list_add_tail(&em->list, &extents);
 	}
 
@@ -2954,13 +2957,18 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		em = list_entry(extents.next, struct extent_map, list);
 
 		list_del_init(&em->list);
+		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
 
 		/*
 		 * If we had an error we just need to delete everybody from our
 		 * private list.
 		 */
-		if (ret)
+		if (ret) {
+			free_extent_map(em);
 			continue;
+		}
+
+		write_unlock(&tree->lock);
 
 		/*
 		 * If the previous EM and the last extent we left off on aren't
@@ -2971,21 +2979,26 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 			ret = copy_items(trans, inode, dst_path, args.src,
 					 args.start_slot, args.nr,
 					 LOG_INODE_ALL);
-			if (ret)
+			if (ret) {
+				free_extent_map(em);
+				write_lock(&tree->lock);
 				continue;
+			}
 			btrfs_release_path(path);
 			args.nr = 0;
 		}
 
 		ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
+		free_extent_map(em);
+		write_lock(&tree->lock);
 	}
+	WARN_ON(!list_empty(&extents));
+	write_unlock(&tree->lock);
 
 	if (!ret && args.nr)
 		ret = copy_items(trans, inode, dst_path, args.src,
 				 args.start_slot, args.nr, LOG_INODE_ALL);
 	btrfs_release_path(path);
-	WARN_ON(!list_empty(&extents));
-	write_unlock(&tree->lock);
 	return ret;
 }
 
-- 
cgit v0.10.2


From b5bae2612af92fd8e7bcdcf7ce3e0259e8d341c9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 13:43:01 -0400
Subject: Btrfs: fix race when getting the eb out of page->private

We can race when checking wether PagePrivate is set on a page and we
actually have an eb saved in the pages private pointer.  We could have
easily written out this page and released it in the time that we did the
pagevec lookup and actually got around to looking at this page.  So use
mapping->private_lock to ensure we get a consistent view of the
page->private pointer.  This is inline with the alloc and releasepage paths
which use private_lock when manipulating page->private.  Thanks,

Reported-by: David Sterba <dave@jikos.cz>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 90bd9f7..a2c2157 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3256,19 +3256,34 @@ retry:
 				break;
 			}
 
+			spin_lock(&mapping->private_lock);
+			if (!PagePrivate(page)) {
+				spin_unlock(&mapping->private_lock);
+				continue;
+			}
+
 			eb = (struct extent_buffer *)page->private;
+
+			/*
+			 * Shouldn't happen and normally this would be a BUG_ON
+			 * but no sense in crashing the users box for something
+			 * we can survive anyway.
+			 */
 			if (!eb) {
+				spin_unlock(&mapping->private_lock);
 				WARN_ON(1);
 				continue;
 			}
 
-			if (eb == prev_eb)
+			if (eb == prev_eb) {
+				spin_unlock(&mapping->private_lock);
 				continue;
+			}
 
-			if (!atomic_inc_not_zero(&eb->refs)) {
-				WARN_ON(1);
+			ret = atomic_inc_not_zero(&eb->refs);
+			spin_unlock(&mapping->private_lock);
+			if (!ret)
 				continue;
-			}
 
 			prev_eb = eb;
 			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
-- 
cgit v0.10.2


From 892951a92ebf8b390815b24f9418f297a5081898 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 13:51:53 -0400
Subject: Btrfs: remove unused write cache pages hook

The btree inode has it's own write cache pages so we can remove this write
cache pages hook as it's not used.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5d3f813..2c18a4e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3347,52 +3347,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 }
 
-int btree_lock_page_hook(struct page *page, void *data,
-				void (*flush_fn)(void *))
-{
-	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_buffer *eb;
-
-	/*
-	 * We culled this eb but the page is still hanging out on the mapping,
-	 * carry on.
-	 */
-	if (!PagePrivate(page))
-		goto out;
-
-	eb = (struct extent_buffer *)page->private;
-	if (!eb) {
-		WARN_ON(1);
-		goto out;
-	}
-	if (page != eb->pages[0])
-		goto out;
-
-	if (!btrfs_try_tree_write_lock(eb)) {
-		flush_fn(data);
-		btrfs_tree_lock(eb);
-	}
-	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-
-	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		if (root->fs_info->dirty_metadata_bytes >= eb->len)
-			root->fs_info->dirty_metadata_bytes -= eb->len;
-		else
-			WARN_ON(1);
-		spin_unlock(&root->fs_info->delalloc_lock);
-	}
-
-	btrfs_tree_unlock(eb);
-out:
-	if (!trylock_page(page)) {
-		flush_fn(data);
-		lock_page(page);
-	}
-	return 0;
-}
-
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 			      int read_only)
 {
@@ -3787,7 +3741,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
 }
 
 static struct extent_io_ops btree_extent_io_ops = {
-	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.readpage_io_failed_hook = btree_io_failed_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
-- 
cgit v0.10.2


From 926ced123bd6651b30a07f65a2a8a0b26154cd58 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 13:58:59 -0400
Subject: Btrfs: don't do anything in our ->freeze_fs and ->unfreeze_fs

We do not need to do anything special to freeze or unfreeze, it's all taken
care of by the generic work, and what we currently have is wrong anyway
since we shouldn't be returnning to userspace with mutexes held anyway.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5aa3b81..fb260bb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1511,17 +1511,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 
 static int btrfs_freeze(struct super_block *sb)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-	mutex_lock(&fs_info->transaction_kthread_mutex);
-	mutex_lock(&fs_info->cleaner_mutex);
 	return 0;
 }
 
 static int btrfs_unfreeze(struct super_block *sb)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-	mutex_unlock(&fs_info->cleaner_mutex);
-	mutex_unlock(&fs_info->transaction_kthread_mutex);
 	return 0;
 }
 
-- 
cgit v0.10.2


From c3308f84c1743eabb91f4976a314d118d5ea2342 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 14:51:22 -0400
Subject: Btrfs: fix punch hole when no extent exists

I saw the warning in btrfs_drop_extent_cache where our end is less than our
start while running xfstests 68 in a loop.  This is because we
unconditionally do drop_end = min(end, extent_end) in
__btrfs_drop_extents(), even though we may not have found an extent in the
range we were looking to drop.  So keep track of wether or not we found
something, and if we didn't just use our end.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d0fc4c5..110d3cb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -609,6 +609,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int ret;
 	int modify_tree = -1;
 	int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
+	int found = 0;
 
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -674,6 +675,7 @@ next_slot:
 			goto next_slot;
 		}
 
+		found = 1;
 		search_start = max(key.offset, start);
 		if (recow || !modify_tree) {
 			modify_tree = -1;
@@ -829,7 +831,7 @@ next_slot:
 	}
 
 	if (drop_end)
-		*drop_end = min(end, extent_end);
+		*drop_end = found ? min(end, extent_end) : end;
 	btrfs_release_path(path);
 	return ret;
 }
-- 
cgit v0.10.2


From 7e97b8daf63487c20f78487bd4045f39b0d97cf4 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 7 Sep 2012 05:56:55 -0600
Subject: btrfs: allow setting NOCOW for a zero sized file via ioctl

Hi,

the patch si simple, but it has user visible impact and I'm not quite sure how
to resolve it.

In short, $subj says it, chattr -C supports it and we want to use it.

The conditions that acutally allow to change the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:

1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
     will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
     warn that the COW flag has not been changed
2.2.1) dtto, no syslog message

Man page of chattr states that

 "If it is set on a file which already has data blocks, it is undefined when
 the blocks assigned to the file will be fully stable."

Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.

My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.

The patch implements 2.2.1.

david

-------------8<-------------------
From: David Sterba <dsterba@suse.cz>

It's safe to turn off checksums for a zero sized file.

http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030

"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.

Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."

Signed-off-by: David Sterba <dsterba@suse.cz>

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 38e3d6a..4d7f4bb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -181,6 +181,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	int ret;
 	u64 ip_oldflags;
 	unsigned int i_oldflags;
+	umode_t mode;
 
 	if (btrfs_root_readonly(root))
 		return -EROFS;
@@ -203,6 +204,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
 	ip_oldflags = ip->flags;
 	i_oldflags = inode->i_flags;
+	mode = inode->i_mode;
 
 	flags = btrfs_mask_flags(inode->i_mode, flags);
 	oldflags = btrfs_flags_to_ioctl(ip->flags);
@@ -237,10 +239,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 		ip->flags |= BTRFS_INODE_DIRSYNC;
 	else
 		ip->flags &= ~BTRFS_INODE_DIRSYNC;
-	if (flags & FS_NOCOW_FL)
-		ip->flags |= BTRFS_INODE_NODATACOW;
-	else
-		ip->flags &= ~BTRFS_INODE_NODATACOW;
+	if (flags & FS_NOCOW_FL) {
+		if (S_ISREG(mode)) {
+			/*
+			 * It's safe to turn csums off here, no extents exist.
+			 * Otherwise we want the flag to reflect the real COW
+			 * status of the file and will not set it.
+			 */
+			if (inode->i_size == 0)
+				ip->flags |= BTRFS_INODE_NODATACOW
+					   | BTRFS_INODE_NODATASUM;
+		} else {
+			ip->flags |= BTRFS_INODE_NODATACOW;
+		}
+	} else {
+		/*
+		 * Revert back under same assuptions as above
+		 */
+		if (S_ISREG(mode)) {
+			if (inode->i_size == 0)
+				ip->flags &= ~(BTRFS_INODE_NODATACOW
+				             | BTRFS_INODE_NODATASUM);
+		} else {
+			ip->flags &= ~BTRFS_INODE_NODATACOW;
+		}
+	}
 
 	/*
 	 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
-- 
cgit v0.10.2


From aa42ffd918c420d5625b25b7a0bc2bbde4c9f890 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 18 Sep 2012 03:52:23 -0600
Subject: Btrfs: fix off-by-one in file clone

Btrfs uses inclusive range end for lock_extent(), unlock_extent() and
related functions, so we made off-by-one errors in file clone.

This fixes it and also fixes some style problems.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4d7f4bb..d6836af 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2481,13 +2481,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	   another, and lock file content */
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(src)->io_tree, off, off+len);
-		ordered = btrfs_lookup_first_ordered_extent(src, off+len);
+		lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+		ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
 		if (!ordered &&
-		    !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
-				   EXTENT_DELALLOC, 0, NULL))
+		    !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
+				    EXTENT_DELALLOC, 0, NULL))
 			break;
-		unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
+		unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
 		btrfs_wait_ordered_range(src, off, len);
@@ -2561,7 +2561,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			btrfs_release_path(path);
 
 			if (key.offset + datal <= off ||
-			    key.offset >= off+len)
+			    key.offset >= off + len - 1)
 				goto next;
 
 			memcpy(&new_key, &key, sizeof(new_key));
@@ -2662,8 +2662,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					new_key.offset += skip;
 				}
 
-				if (key.offset + datal > off+len)
-					trim = key.offset + datal - (off+len);
+				if (key.offset + datal > off + len)
+					trim = key.offset + datal - (off + len);
 
 				if (comp && (skip || trim)) {
 					ret = -EINVAL;
@@ -2740,7 +2740,7 @@ next:
 	ret = 0;
 out:
 	btrfs_release_path(path);
-	unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
+	unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
 out_unlock:
 	mutex_unlock(&src->i_mutex);
 	mutex_unlock(&inode->i_mutex);
-- 
cgit v0.10.2


From 8732d44f806a9da9a7ca4d1704b8a1ed81639bc4 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 17 Sep 2012 23:52:38 -0600
Subject: Btrfs: fix the missing error information in create_pending_snapshot()

The macro btrfs_abort_transaction() can get the line number of the code
where the problem happens, so we should invoke it in the place that the
error occurs, or we will lose the line number.

Reported-by: David Sterba <dave@jikos.cz>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a4fe549..910ff80 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1073,7 +1073,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		goto fail;
 	} else if (IS_ERR(dir_item)) {
 		ret = PTR_ERR(dir_item);
-		goto abort_trans;
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
 	}
 	btrfs_release_path(path);
 
@@ -1084,8 +1085,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	 * snapshot
 	 */
 	ret = btrfs_run_delayed_items(trans, root);
-	if (ret)	/* Transaction aborted */
-		goto abort_trans;
+	if (ret) {	/* Transaction aborted */
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	record_root_in_trans(trans, root);
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
@@ -1118,7 +1121,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (ret) {
 		btrfs_tree_unlock(old);
 		free_extent_buffer(old);
-		goto abort_trans;
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
 	}
 
 	btrfs_set_lock_blocking(old);
@@ -1127,8 +1131,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	/* clean up in any case */
 	btrfs_tree_unlock(old);
 	free_extent_buffer(old);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	/* see comments in should_cow_block() */
 	root->force_cow = 1;
@@ -1140,8 +1146,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
 	btrfs_tree_unlock(tmp);
 	free_extent_buffer(tmp);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	/*
 	 * insert root back/forward references
@@ -1150,23 +1158,30 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				 parent_root->root_key.objectid,
 				 btrfs_ino(parent_inode), index,
 				 dentry->d_name.name, dentry->d_name.len);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	key.offset = (u64)-1;
 	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
 	if (IS_ERR(pending->snap)) {
 		ret = PTR_ERR(pending->snap);
-		goto abort_trans;
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
 	}
 
 	ret = btrfs_reloc_post_snapshot(trans, pending);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	ret = btrfs_insert_dir_item(trans, parent_root,
 				    dentry->d_name.name, dentry->d_name.len,
@@ -1174,15 +1189,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				    BTRFS_FT_DIR, index);
 	/* We have check then name at the beginning, so it is impossible. */
 	BUG_ON(ret == -EEXIST);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	btrfs_i_size_write(parent_inode, parent_inode->i_size +
 					 dentry->d_name.len * 2);
 	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, parent_root, parent_inode);
 	if (ret)
-		goto abort_trans;
+		btrfs_abort_transaction(trans, root, ret);
 fail:
 	dput(parent);
 	trans->block_rsv = rsv;
@@ -1193,10 +1210,6 @@ root_item_alloc_fail:
 path_alloc_fail:
 	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
 	return ret;
-
-abort_trans:
-	btrfs_abort_transaction(trans, root, ret);
-	goto fail;
 }
 
 /*
-- 
cgit v0.10.2


From 005d6427ac4f276d937a36ca6a1d62b181ed70bf Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Tue, 18 Sep 2012 07:52:32 -0600
Subject: btrfs: move transaction aborts to the point of failure

Call btrfs_abort_transaction as early as possible when an error
condition is detected, that way the line number reported is useful
and we're not clueless anymore which error path led to the abort.

Signed-off-by: David Sterba <dsterba@suse.cz>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1372057..399521a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3400,6 +3400,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
 	}
 }
 
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+
 #define btrfs_abort_transaction(trans, root, errno)		\
 do {								\
 	__btrfs_abort_transaction(trans, root, __func__,	\
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a3a902f..395e222 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5123,8 +5123,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			ret = remove_extent_backref(trans, extent_root, path,
 						    NULL, refs_to_drop,
 						    is_data);
-			if (ret)
-				goto abort;
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 			btrfs_release_path(path);
 			path->leave_spinning = 1;
 
@@ -5142,8 +5144,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 					btrfs_print_leaf(extent_root,
 							 path->nodes[0]);
 			}
-			if (ret < 0)
-				goto abort;
+			if (ret < 0) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 			extent_slot = path->slots[0];
 		}
 	} else if (ret == -ENOENT) {
@@ -5157,7 +5161,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		       (unsigned long long)owner_objectid,
 		       (unsigned long long)owner_offset);
 	} else {
-		goto abort;
+		btrfs_abort_transaction(trans, extent_root, ret);
+		goto out;
 	}
 
 	leaf = path->nodes[0];
@@ -5167,8 +5172,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		BUG_ON(found_extent || extent_slot != path->slots[0]);
 		ret = convert_extent_item_v0(trans, extent_root, path,
 					     owner_objectid, 0);
-		if (ret < 0)
-			goto abort;
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
 
 		btrfs_release_path(path);
 		path->leave_spinning = 1;
@@ -5185,8 +5192,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			       (unsigned long long)bytenr);
 			btrfs_print_leaf(extent_root, path->nodes[0]);
 		}
-		if (ret < 0)
-			goto abort;
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
+
 		extent_slot = path->slots[0];
 		leaf = path->nodes[0];
 		item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5223,8 +5233,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			ret = remove_extent_backref(trans, extent_root, path,
 						    iref, refs_to_drop,
 						    is_data);
-			if (ret)
-				goto abort;
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 		}
 	} else {
 		if (found_extent) {
@@ -5241,27 +5253,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
-		if (ret)
-			goto abort;
+		if (ret) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
 		btrfs_release_path(path);
 
 		if (is_data) {
 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
-			if (ret)
-				goto abort;
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 		}
 
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
-		if (ret)
-			goto abort;
+		if (ret) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
 	}
 out:
 	btrfs_free_path(path);
 	return ret;
-
-abort:
-	btrfs_abort_transaction(trans, extent_root, ret);
-	goto out;
 }
 
 /*
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 10d8e4d..eb923d0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		return -ENOMEM;
 
 	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-	if (ret < 0)
-		goto out_abort;
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
@@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_release_path(path);
 		ret = btrfs_search_slot(trans, root, key, path,
 				-1, 1);
-		if (ret < 0)
-			goto out_abort;
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+
 		ret = btrfs_del_item(trans, root, path);
-		if (ret < 0)
-			goto out_abort;
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
 		btrfs_release_path(path);
 		ret = btrfs_insert_empty_item(trans, root, path,
 				key, sizeof(*item));
-		if (ret < 0)
-			goto out_abort;
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
 		l = path->nodes[0];
 		slot = path->slots[0];
 		ptr = btrfs_item_ptr_offset(l, slot);
@@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 out:
 	btrfs_free_path(path);
 	return ret;
-
-out_abort:
-	btrfs_abort_transaction(trans, root, ret);
-	goto out;
 }
 
 int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 88b969a..4f4de51 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1775,15 +1775,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
 	if (seeding_dev) {
 		ret = init_first_rw_device(trans, root, device);
-		if (ret)
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
 			goto error_trans;
+		}
 		ret = btrfs_finish_sprout(trans, root);
-		if (ret)
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
 			goto error_trans;
+		}
 	} else {
 		ret = btrfs_add_device(trans, root, device);
-		if (ret)
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
 			goto error_trans;
+		}
 	}
 
 	/*
@@ -1814,7 +1820,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
 error_trans:
 	unlock_chunks(root);
-	btrfs_abort_transaction(trans, root, ret);
 	btrfs_end_transaction(trans, root);
 	rcu_string_free(device->name);
 	kfree(device);
@@ -3608,12 +3613,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
 				  &sys_chunk_size, &sys_stripe_size,
 				  sys_chunk_offset, alloc_profile);
-	if (ret)
-		goto abort;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
-	if (ret)
-		goto abort;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	/*
 	 * Modifying chunk tree needs allocating new blocks from both
@@ -3623,19 +3632,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 	 */
 	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
 				   chunk_size, stripe_size);
-	if (ret)
-		goto abort;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
 				   sys_chunk_offset, sys_chunk_size,
 				   sys_stripe_size);
 	if (ret)
-		goto abort;
+		btrfs_abort_transaction(trans, root, ret);
 
-	return 0;
+out:
 
-abort:
-	btrfs_abort_transaction(trans, root, ret);
 	return ret;
 }
 
-- 
cgit v0.10.2


From 0aa4a17d8232e7d8a42763b53bb9943bc0484b18 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 19 Sep 2012 15:42:38 -0400
Subject: Btrfs: handle not finding the extent exactly when logging changed
 extents

I started hitting warnings when running xfstest 68 in a loop because there
were EM's that were not lined up properly with the physical extents.  This
is ok, if we do something like punch a hole or write to a preallocated space
or something like that we can have an EM that doesn't cover the entire
physical extent.  So fix the tree logging stuff to cope with this case so we
don't just commit the transaction.  With this patch I no longer see the
warnings from the tree logging code.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ed1f7ce..e0ef92f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2833,6 +2833,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_file_extent_item *fi;
 	struct btrfs_key key;
 	u64 start = em->mod_start;
+	u64 search_start = start;
 	u64 len = em->mod_len;
 	u64 num_bytes;
 	int nritems;
@@ -2848,23 +2849,55 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	while (len) {
 		if (args->nr)
 			goto next_slot;
+again:
 		key.objectid = btrfs_ino(inode);
 		key.type = BTRFS_EXTENT_DATA_KEY;
-		key.offset = start;
+		key.offset = search_start;
 
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			return ret;
+
 		if (ret) {
 			/*
-			 * This shouldn't happen, but it might so warn and
-			 * return an error.
+			 * A rare case were we can have an em for a section of a
+			 * larger extent so we need to make sure that this em
+			 * falls within the extent we've found.  If not we just
+			 * bail and go back to ye-olde way of doing things but
+			 * it happens often enough in testing that we need to do
+			 * this dance to make sure.
 			 */
-			WARN_ON(1);
-			return -ENOENT;
+			do {
+				if (path->slots[0] == 0) {
+					btrfs_release_path(path);
+					if (search_start == 0)
+						return -ENOENT;
+					search_start--;
+					goto again;
+				}
+
+				path->slots[0]--;
+				btrfs_item_key_to_cpu(path->nodes[0], &key,
+						      path->slots[0]);
+				if (key.objectid != btrfs_ino(inode) ||
+				    key.type != BTRFS_EXTENT_DATA_KEY) {
+					btrfs_release_path(path);
+					return -ENOENT;
+				}
+			} while (key.offset > start);
+
+			fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					    struct btrfs_file_extent_item);
+			num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
+								fi);
+			if (key.offset + num_bytes <= start) {
+				btrfs_release_path(path);
+				return -ENOENT;
+			}
 		}
 		args->src = path->nodes[0];
 next_slot:
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 		fi = btrfs_item_ptr(args->src, path->slots[0],
 				    struct btrfs_file_extent_item);
 		if (args->nr &&
@@ -2898,8 +2931,9 @@ next_slot:
 		} else {
 			len -= num_bytes;
 		}
-		start += btrfs_file_extent_num_bytes(args->src, fi);
+		start = key.offset + num_bytes;
 		args->next_offset = start;
+		search_start = start;
 
 		if (path->slots[0] < nritems) {
 			if (len)
-- 
cgit v0.10.2


From 5a1d7843ca4b3a9009bea87f85ad33854b910aea Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Fri, 17 Aug 2012 14:04:41 -0700
Subject: btrfs: improved readablity for add_inode_ref

Moved part of the code into a sub function and replaced most of the gotos
by ifs, hoping that it will be easier to read now.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Mark Fasheh <mfasheh@suse.de>

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e0ef92f..15dae58 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -785,76 +785,18 @@ out:
 	return match;
 }
 
-
-/*
- * replay one inode back reference item found in the log tree.
- * eb, slot and key refer to the buffer and key found in the log tree.
- * root is the destination we are replaying into, and path is for temp
- * use by this function.  (it should be released on return).
- */
-static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
-				  struct btrfs_root *log,
 				  struct btrfs_path *path,
-				  struct extent_buffer *eb, int slot,
-				  struct btrfs_key *key)
+				  struct btrfs_root *log_root,
+				  struct inode *dir, struct inode *inode,
+				  struct btrfs_key *key,
+				  struct extent_buffer *eb,
+				  struct btrfs_inode_ref *ref,
+				  char *name, int namelen, int *search_done)
 {
-	struct btrfs_inode_ref *ref;
-	struct btrfs_dir_item *di;
-	struct inode *dir;
-	struct inode *inode;
-	unsigned long ref_ptr;
-	unsigned long ref_end;
-	char *name;
-	int namelen;
 	int ret;
-	int search_done = 0;
-
-	/*
-	 * it is possible that we didn't log all the parent directories
-	 * for a given inode.  If we don't find the dir, just don't
-	 * copy the back ref in.  The link count fixup code will take
-	 * care of the rest
-	 */
-	dir = read_one_inode(root, key->offset);
-	if (!dir)
-		return -ENOENT;
-
-	inode = read_one_inode(root, key->objectid);
-	if (!inode) {
-		iput(dir);
-		return -EIO;
-	}
-
-	ref_ptr = btrfs_item_ptr_offset(eb, slot);
-	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
-
-again:
-	ref = (struct btrfs_inode_ref *)ref_ptr;
-
-	namelen = btrfs_inode_ref_name_len(eb, ref);
-	name = kmalloc(namelen, GFP_NOFS);
-	BUG_ON(!name);
-
-	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
-
-	/* if we already have a perfect match, we're done */
-	if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
-			 btrfs_inode_ref_index(eb, ref),
-			 name, namelen)) {
-		goto out;
-	}
-
-	/*
-	 * look for a conflicting back reference in the metadata.
-	 * if we find one we have to unlink that name of the file
-	 * before we add our new link.  Later on, we overwrite any
-	 * existing back reference, and we don't want to create
-	 * dangling pointers in the directory.
-	 */
-
-	if (search_done)
-		goto insert;
+	struct btrfs_dir_item *di;
 
 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 	if (ret == 0) {
@@ -869,7 +811,7 @@ again:
 		 * if so, just jump out, we're done
 		 */
 		if (key->objectid == key->offset)
-			goto out_nowrite;
+			return 1;
 
 		/* check all the names in this back reference to see
 		 * if they are in the log.  if so, we allow them to stay
@@ -888,7 +830,7 @@ again:
 					   (unsigned long)(victim_ref + 1),
 					   victim_name_len);
 
-			if (!backref_in_log(log, key, victim_name,
+			if (!backref_in_log(log_root, key, victim_name,
 					    victim_name_len)) {
 				btrfs_inc_nlink(inode);
 				btrfs_release_path(path);
@@ -907,7 +849,7 @@ again:
 		 * NOTE: we have searched root tree and checked the
 		 * coresponding ref, it does not need to check again.
 		 */
-		search_done = 1;
+		*search_done = 1;
 	}
 	btrfs_release_path(path);
 
@@ -930,25 +872,99 @@ again:
 	}
 	btrfs_release_path(path);
 
-insert:
-	/* insert our name */
-	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
-			     btrfs_inode_ref_index(eb, ref));
-	BUG_ON(ret);
+	return 0;
+}
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function.  (it should be released on return).
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_root *log,
+				  struct btrfs_path *path,
+				  struct extent_buffer *eb, int slot,
+				  struct btrfs_key *key)
+{
+	struct btrfs_inode_ref *ref;
+	struct inode *dir;
+	struct inode *inode;
+	unsigned long ref_ptr;
+	unsigned long ref_end;
+	char *name;
+	int namelen;
+	int ret;
+	int search_done = 0;
+
+	/*
+	 * it is possible that we didn't log all the parent directories
+	 * for a given inode.  If we don't find the dir, just don't
+	 * copy the back ref in.  The link count fixup code will take
+	 * care of the rest
+	 */
+	dir = read_one_inode(root, key->offset);
+	if (!dir)
+		return -ENOENT;
 
-	btrfs_update_inode(trans, root, inode);
+	inode = read_one_inode(root, key->objectid);
+	if (!inode) {
+		iput(dir);
+		return -EIO;
+	}
 
-out:
-	ref_ptr = (unsigned long)(ref + 1) + namelen;
-	kfree(name);
-	if (ref_ptr < ref_end)
-		goto again;
+	ref_ptr = btrfs_item_ptr_offset(eb, slot);
+	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+	while (ref_ptr < ref_end) {
+		ref = (struct btrfs_inode_ref *)ref_ptr;
+
+		namelen = btrfs_inode_ref_name_len(eb, ref);
+		name = kmalloc(namelen, GFP_NOFS);
+		BUG_ON(!name);
+
+		read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
+
+		/* if we already have a perfect match, we're done */
+		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+				  btrfs_inode_ref_index(eb, ref),
+				  name, namelen)) {
+			/*
+			 * look for a conflicting back reference in the
+			 * metadata. if we find one we have to unlink that name
+			 * of the file before we add our new link.  Later on, we
+			 * overwrite any existing back reference, and we don't
+			 * want to create dangling pointers in the directory.
+			 */
+
+			if (!search_done) {
+				ret = __add_inode_ref(trans, root, path, log,
+						      dir, inode, key, eb, ref,
+						      name, namelen,
+						      &search_done);
+				if (ret == 1)
+					goto out;
+				BUG_ON(ret);
+			}
+
+			/* insert our name */
+			ret = btrfs_add_link(trans, dir, inode, name, namelen,
+					     0, btrfs_inode_ref_index(eb, ref));
+			BUG_ON(ret);
+
+			btrfs_update_inode(trans, root, inode);
+		}
+
+		ref_ptr = (unsigned long)(ref + 1) + namelen;
+		kfree(name);
+	}
 
 	/* finally write the back reference in the inode */
 	ret = overwrite_item(trans, root, path, eb, slot, key);
 	BUG_ON(ret);
 
-out_nowrite:
+out:
 	btrfs_release_path(path);
 	iput(dir);
 	iput(inode);
-- 
cgit v0.10.2


From f186373fef005cee948a4a39e6a14c2e5f517298 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.de>
Date: Wed, 8 Aug 2012 11:32:27 -0700
Subject: btrfs: extended inode refs

This patch adds basic support for extended inode refs. This includes support
for link and unlink of the refs, which basically gets us support for rename
as well.

Inode creation does not need changing - extended refs are only added after
the ref array is full.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7084431..dc96312 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1109,6 +1109,74 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
 				found_key);
 }
 
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off)
+{
+	int ret, slot;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+
+	key.objectid = inode_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.offset = start_off;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			/*
+			 * If the item at offset is not found,
+			 * btrfs_search_slot will point us to the slot
+			 * where it should be inserted. In our case
+			 * that will be the slot directly before the
+			 * next INODE_REF_KEY_V2 item. In the case
+			 * that we're pointing to the last slot in a
+			 * leaf, we must move one leaf over.
+			 */
+			ret = btrfs_next_leaf(root, path);
+			if (ret) {
+				if (ret >= 1)
+					ret = -ENOENT;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/*
+		 * Check that we're still looking at an extended ref key for
+		 * this particular objectid. If we have different
+		 * objectid or type then there are no more to be found
+		 * in the tree and we can exit.
+		 */
+		ret = -ENOENT;
+		if (found_key.objectid != inode_objectid)
+			break;
+		if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+			break;
+
+		ret = 0;
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		extref = (struct btrfs_inode_extref *)ptr;
+		*ret_extref = extref;
+		if (found_off)
+			*found_off = found_key.offset;
+		break;
+	}
+
+	return ret;
+}
+
 /*
  * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
  * of the path are separated by '/' and the path is guaranteed to be
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 4fda5d8..0b920c1 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -70,4 +70,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 					struct btrfs_path *path);
 void free_ipath(struct inode_fs_paths *ipath);
 
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off);
+
 #endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 399521a..50dcd0f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -154,6 +154,13 @@ struct btrfs_ordered_sum;
  */
 #define BTRFS_NAME_LEN 255
 
+/*
+ * Theoretical limit is larger, but we keep this down to a sane
+ * value. That should limit greatly the possibility of collisions on
+ * inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
 /* 32 bytes in various csum fields */
 #define BTRFS_CSUM_SIZE 32
 
@@ -489,6 +496,8 @@ struct btrfs_super_block {
  */
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)
 
+#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP			\
@@ -496,7 +505,8 @@ struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
 	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
-	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -643,6 +653,14 @@ struct btrfs_inode_ref {
 	/* name goes here */
 } __attribute__ ((__packed__));
 
+struct btrfs_inode_extref {
+	__le64 parent_objectid;
+	__le64 index;
+	__le16 name_len;
+	__u8   name[0];
+	/* name goes here */
+} __attribute__ ((__packed__));
+
 struct btrfs_timespec {
 	__le64 sec;
 	__le32 nsec;
@@ -1601,6 +1619,7 @@ struct btrfs_ioctl_defrag_range_args {
  */
 #define BTRFS_INODE_ITEM_KEY		1
 #define BTRFS_INODE_REF_KEY		12
+#define BTRFS_INODE_EXTREF_KEY		13
 #define BTRFS_XATTR_ITEM_KEY		24
 #define BTRFS_ORPHAN_ITEM_KEY		48
 /* reserve 2-15 close to the inode for later flexibility */
@@ -1987,6 +2006,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
 BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+		   parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+		   name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
 BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -3184,12 +3210,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
 			   u64 inode_objectid, u64 ref_objectid, u64 *index);
-struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			struct btrfs_path *path,
-			const char *name, int name_len,
-			u64 inode_objectid, u64 ref_objectid, int mod);
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      const char *name, int name_len,
+			      u64 inode_objectid, u64 ref_objectid, int mod,
+			      u64 *ret_index);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
@@ -3197,6 +3223,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       *root, struct btrfs_path *path,
 		       struct btrfs_key *location, int mod);
 
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path,
+			  const char *name, int name_len,
+			  u64 inode_objectid, u64 ref_objectid, int ins_len,
+			  int cow);
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+				   u64 ref_objectid, const char *name,
+				   int name_len,
+				   struct btrfs_inode_extref **extref_ret);
+
 /* file-item.c */
 int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, u64 bytenr, u64 len);
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index db2ff97..1d98281 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len)
 {
 	return crc32c((u32)~1, name, len);
 }
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+				    int len)
+{
+	return (u64) crc32c(parent_objectid, name, len);
+}
+
 #endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a13cf1a..48b8fda 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,6 +18,7 @@
 
 #include "ctree.h"
 #include "disk-io.h"
+#include "hash.h"
 #include "transaction.h"
 #include "print-tree.h"
 
@@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
 	return 0;
 }
 
-struct btrfs_inode_ref *
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
+				   const char *name, int name_len,
+				   struct btrfs_inode_extref **extref_ret)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_inode_extref *extref;
+	unsigned long ptr;
+	unsigned long name_ptr;
+	u32 item_size;
+	u32 cur_offset = 0;
+	int ref_name_len;
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+	/*
+	 * Search all extended backrefs in this item. We're only
+	 * looking through any collisions so most of the time this is
+	 * just going to compare against one buffer. If all is well,
+	 * we'll return success and the inode ref object.
+	 */
+	while (cur_offset < item_size) {
+		extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+		name_ptr = (unsigned long)(&extref->name);
+		ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+		if (ref_name_len == name_len &&
+		    btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
+		    (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
+			if (extref_ret)
+				*extref_ret = extref;
+			return 1;
+		}
+
+		cur_offset += ref_name_len + sizeof(*extref);
+	}
+	return 0;
+}
+
+static struct btrfs_inode_ref *
 btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			struct btrfs_path *path,
-			const char *name, int name_len,
-			u64 inode_objectid, u64 ref_objectid, int mod)
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       const char *name, int name_len,
+		       u64 inode_objectid, u64 ref_objectid, int ins_len,
+		       int cow)
 {
+	int ret;
 	struct btrfs_key key;
 	struct btrfs_inode_ref *ref;
-	int ins_len = mod < 0 ? -1 : 0;
-	int cow = mod != 0;
-	int ret;
 
 	key.objectid = inode_objectid;
 	key.type = BTRFS_INODE_REF_KEY;
@@ -77,13 +117,150 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
 	return ref;
 }
 
-int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+/* Returns NULL if no extref found */
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path,
+			  const char *name, int name_len,
+			  u64 inode_objectid, u64 ref_objectid, int ins_len,
+			  int cow)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_inode_extref *extref;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_EXTREF_KEY;
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return NULL;
+	if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
+		return NULL;
+	return extref;
+}
+
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      const char *name, int name_len,
+			      u64 inode_objectid, u64 ref_objectid, int mod,
+			      u64 *ret_index)
+{
+	struct btrfs_inode_ref *ref;
+	struct btrfs_inode_extref *extref;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+
+	ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
+				     inode_objectid, ref_objectid, ins_len,
+				     cow);
+	if (IS_ERR(ref))
+		return PTR_ERR(ref);
+
+	if (ref != NULL) {
+		*ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
+		return 0;
+	}
+
+	btrfs_release_path(path);
+
+	extref = btrfs_lookup_inode_extref(trans, root, path, name,
+					   name_len, inode_objectid,
+					   ref_objectid, ins_len, cow);
+	if (IS_ERR(extref))
+		return PTR_ERR(extref);
+
+	if (extref) {
+		*ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
 			   u64 inode_objectid, u64 ref_objectid, u64 *index)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	int ret;
+	int del_len = name_len + sizeof(*extref);
+	unsigned long ptr;
+	unsigned long item_start;
+	u32 item_size;
+
+	key.objectid = inode_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * Sanity check - did we find the right item for this name?
+	 * This should always succeed so error here will make the FS
+	 * readonly.
+	 */
+	if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
+					    name, name_len, &extref)) {
+		btrfs_std_error(root->fs_info, -ENOENT);
+		ret = -EROFS;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	if (index)
+		*index = btrfs_inode_extref_index(leaf, extref);
+
+	if (del_len == item_size) {
+		/*
+		 * Common case only one ref in the item, remove the
+		 * whole item.
+		 */
+		ret = btrfs_del_item(trans, root, path);
+		goto out;
+	}
+
+	ptr = (unsigned long)extref;
+	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+	memmove_extent_buffer(leaf, ptr, ptr + del_len,
+			      item_size - (ptr + del_len - item_start));
+
+	btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
+
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			const char *name, int name_len,
+			u64 inode_objectid, u64 ref_objectid, u64 *index)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
 	struct btrfs_inode_ref *ref;
 	struct extent_buffer *leaf;
 	unsigned long ptr;
@@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	u32 item_size;
 	u32 sub_item_len;
 	int ret;
+	int search_ext_refs = 0;
 	int del_len = name_len + sizeof(*ref);
 
 	key.objectid = inode_objectid;
@@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0) {
 		ret = -ENOENT;
+		search_ext_refs = 1;
 		goto out;
 	} else if (ret < 0) {
 		goto out;
 	}
 	if (!find_name_in_backref(path, name, name_len, &ref)) {
 		ret = -ENOENT;
+		search_ext_refs = 1;
 		goto out;
 	}
 	leaf = path->nodes[0];
@@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
 	memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			      item_size - (ptr + sub_item_len - item_start));
-	btrfs_truncate_item(trans, root, path,
-				  item_size - sub_item_len, 1);
+	btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
+out:
+	btrfs_free_path(path);
+
+	if (search_ext_refs) {
+		/*
+		 * No refs were found, or we could not find the
+		 * name in our ref array. Find and remove the extended
+		 * inode ref then.
+		 */
+		return btrfs_del_inode_extref(trans, root, name, name_len,
+					      inode_objectid, ref_objectid, index);
+	}
+
+	return ret;
+}
+
+/*
+ * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ *
+ * The caller must have checked against BTRFS_LINK_MAX already.
+ */
+static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     const char *name, int name_len,
+				     u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+	struct btrfs_inode_extref *extref;
+	int ret;
+	int ins_len = name_len + sizeof(*extref);
+	unsigned long ptr;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_EXTREF_KEY;
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      ins_len);
+	if (ret == -EEXIST) {
+		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+						   name, name_len, NULL))
+			goto out;
+
+		btrfs_extend_item(trans, root, path, ins_len);
+		ret = 0;
+	}
+	if (ret < 0)
+		goto out;
+
+	leaf = path->nodes[0];
+	item = btrfs_item_nr(leaf, path->slots[0]);
+	ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
+	ptr += btrfs_item_size(leaf, item) - ins_len;
+	extref = (struct btrfs_inode_extref *)ptr;
+
+	btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+	btrfs_set_inode_extref_index(path->nodes[0], extref, index);
+	btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
+
+	ptr = (unsigned long)&extref->name;
+	write_extent_buffer(path->nodes[0], name, ptr, name_len);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 
 out:
 	btrfs_free_path(path);
+
+	if (ret == -EMLINK) {
+		struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+		/* We ran out of space in the ref array. Need to
+		 * add an extended ref. */
+		if (btrfs_super_incompat_flags(disk_super)
+		    & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+			ret = btrfs_insert_inode_extref(trans, root, name,
+							name_len,
+							inode_objectid,
+							ref_objectid, index);
+	}
+
 	return ret;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c50f7c..596305e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2903,7 +2903,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_path *path;
-	struct btrfs_inode_ref *ref;
 	struct btrfs_dir_item *di;
 	struct inode *inode = dentry->d_inode;
 	u64 index;
@@ -3017,17 +3016,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
 	}
 	btrfs_release_path(path);
 
-	ref = btrfs_lookup_inode_ref(trans, root, path,
-				dentry->d_name.name, dentry->d_name.len,
-				ino, dir_ino, 0);
-	if (IS_ERR(ref)) {
-		err = PTR_ERR(ref);
+	ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
+					dentry->d_name.len, ino, dir_ino, 0,
+					&index);
+	if (ret) {
+		err = ret;
 		goto out;
 	}
-	BUG_ON(!ref); /* Logic error */
+
 	if (check_path_shared(root, path))
 		goto out;
-	index = btrfs_inode_ref_index(path->nodes[0], ref);
+
 	btrfs_release_path(path);
 
 	/*
@@ -4743,6 +4742,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
 	key[0].offset = 0;
 
+	/*
+	 * Start new inodes with an inode_ref. This is slightly more
+	 * efficient for small numbers of hard links since they will
+	 * be packed into one item. Extended refs will kick in if we
+	 * add more hard links than can fit in the ref item.
+	 */
 	key[1].objectid = objectid;
 	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
 	key[1].offset = ref_objectid;
@@ -5049,7 +5054,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (root->objectid != BTRFS_I(inode)->root->objectid)
 		return -EXDEV;
 
-	if (inode->i_nlink == ~0U)
+	if (inode->i_nlink >= BTRFS_LINK_MAX)
 		return -EMLINK;
 
 	err = btrfs_set_inode_index(dir, &index);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 15dae58..1d7b348 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -24,8 +24,10 @@
 #include "disk-io.h"
 #include "locking.h"
 #include "print-tree.h"
+#include "backref.h"
 #include "compat.h"
 #include "tree-log.h"
+#include "hash.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
@@ -743,6 +745,7 @@ out:
  */
 static noinline int backref_in_log(struct btrfs_root *log,
 				   struct btrfs_key *key,
+				   u64 ref_objectid,
 				   char *name, int namelen)
 {
 	struct btrfs_path *path;
@@ -763,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log,
 	if (ret != 0)
 		goto out;
 
-	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+
+	if (key->type == BTRFS_INODE_EXTREF_KEY) {
+		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+						   name, namelen, NULL))
+			match = 1;
+
+		goto out;
+	}
+
+	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 	ptr_end = ptr + item_size;
 	while (ptr < ptr_end) {
 		ref = (struct btrfs_inode_ref *)ptr;
@@ -790,27 +802,36 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 				  struct btrfs_path *path,
 				  struct btrfs_root *log_root,
 				  struct inode *dir, struct inode *inode,
-				  struct btrfs_key *key,
 				  struct extent_buffer *eb,
-				  struct btrfs_inode_ref *ref,
-				  char *name, int namelen, int *search_done)
+				  u64 inode_objectid, u64 parent_objectid,
+				  u64 ref_index, char *name, int namelen,
+				  int *search_done)
 {
 	int ret;
+	char *victim_name;
+	int victim_name_len;
+	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
+	struct btrfs_key search_key;
+	struct btrfs_inode_extref *extref;
 
-	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+again:
+	/* Search old style refs */
+	search_key.objectid = inode_objectid;
+	search_key.type = BTRFS_INODE_REF_KEY;
+	search_key.offset = parent_objectid;
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	if (ret == 0) {
-		char *victim_name;
-		int victim_name_len;
 		struct btrfs_inode_ref *victim_ref;
 		unsigned long ptr;
 		unsigned long ptr_end;
-		struct extent_buffer *leaf = path->nodes[0];
+
+		leaf = path->nodes[0];
 
 		/* are we trying to overwrite a back ref for the root directory
 		 * if so, just jump out, we're done
 		 */
-		if (key->objectid == key->offset)
+		if (search_key.objectid == search_key.offset)
 			return 1;
 
 		/* check all the names in this back reference to see
@@ -830,7 +851,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 					   (unsigned long)(victim_ref + 1),
 					   victim_name_len);
 
-			if (!backref_in_log(log_root, key, victim_name,
+			if (!backref_in_log(log_root, &search_key,
+					    parent_objectid,
+					    victim_name,
 					    victim_name_len)) {
 				btrfs_inc_nlink(inode);
 				btrfs_release_path(path);
@@ -838,9 +861,14 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 				ret = btrfs_unlink_inode(trans, root, dir,
 							 inode, victim_name,
 							 victim_name_len);
+				BUG_ON(ret);
 				btrfs_run_delayed_items(trans, root);
+				kfree(victim_name);
+				*search_done = 1;
+				goto again;
 			}
 			kfree(victim_name);
+
 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
 		}
 		BUG_ON(ret);
@@ -853,10 +881,74 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(path);
 
+	/* Same search but for extended refs */
+	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
+					   inode_objectid, parent_objectid, 0,
+					   0);
+	if (!IS_ERR_OR_NULL(extref)) {
+		u32 item_size;
+		u32 cur_offset = 0;
+		unsigned long base;
+		struct inode *victim_parent;
+
+		leaf = path->nodes[0];
+
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+		while (cur_offset < item_size) {
+			extref = (struct btrfs_inode_extref *)base + cur_offset;
+
+			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+				goto next;
+
+			victim_name = kmalloc(victim_name_len, GFP_NOFS);
+			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
+					   victim_name_len);
+
+			search_key.objectid = inode_objectid;
+			search_key.type = BTRFS_INODE_EXTREF_KEY;
+			search_key.offset = btrfs_extref_hash(parent_objectid,
+							      victim_name,
+							      victim_name_len);
+			ret = 0;
+			if (!backref_in_log(log_root, &search_key,
+					    parent_objectid, victim_name,
+					    victim_name_len)) {
+				ret = -ENOENT;
+				victim_parent = read_one_inode(root,
+							       parent_objectid);
+				if (victim_parent) {
+					btrfs_inc_nlink(inode);
+					btrfs_release_path(path);
+
+					ret = btrfs_unlink_inode(trans, root,
+								 victim_parent,
+								 inode,
+								 victim_name,
+								 victim_name_len);
+					btrfs_run_delayed_items(trans, root);
+				}
+				BUG_ON(ret);
+				iput(victim_parent);
+				kfree(victim_name);
+				*search_done = 1;
+				goto again;
+			}
+			kfree(victim_name);
+			BUG_ON(ret);
+next:
+			cur_offset += victim_name_len + sizeof(*extref);
+		}
+		*search_done = 1;
+	}
+	btrfs_release_path(path);
+
 	/* look for a conflicting sequence number */
 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
-					 btrfs_inode_ref_index(eb, ref),
-					 name, namelen, 0);
+					 ref_index, name, namelen, 0);
 	if (di && !IS_ERR(di)) {
 		ret = drop_one_dir_item(trans, root, path, dir, di);
 		BUG_ON(ret);
@@ -875,6 +967,48 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+			     u32 *namelen, char **name, u64 *index,
+			     u64 *parent_objectid)
+{
+	struct btrfs_inode_extref *extref;
+
+	extref = (struct btrfs_inode_extref *)ref_ptr;
+
+	*namelen = btrfs_inode_extref_name_len(eb, extref);
+	*name = kmalloc(*namelen, GFP_NOFS);
+	if (*name == NULL)
+		return -ENOMEM;
+
+	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
+			   *namelen);
+
+	*index = btrfs_inode_extref_index(eb, extref);
+	if (parent_objectid)
+		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
+
+	return 0;
+}
+
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+			  u32 *namelen, char **name, u64 *index)
+{
+	struct btrfs_inode_ref *ref;
+
+	ref = (struct btrfs_inode_ref *)ref_ptr;
+
+	*namelen = btrfs_inode_ref_name_len(eb, ref);
+	*name = kmalloc(*namelen, GFP_NOFS);
+	if (*name == NULL)
+		return -ENOMEM;
+
+	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+
+	*index = btrfs_inode_ref_index(eb, ref);
+
+	return 0;
+}
+
 /*
  * replay one inode back reference item found in the log tree.
  * eb, slot and key refer to the buffer and key found in the log tree.
@@ -888,7 +1022,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 				  struct extent_buffer *eb, int slot,
 				  struct btrfs_key *key)
 {
-	struct btrfs_inode_ref *ref;
 	struct inode *dir;
 	struct inode *inode;
 	unsigned long ref_ptr;
@@ -897,6 +1030,27 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	int namelen;
 	int ret;
 	int search_done = 0;
+	int log_ref_ver = 0;
+	u64 parent_objectid;
+	u64 inode_objectid;
+	u64 ref_index;
+	int ref_struct_size;
+
+	ref_ptr = btrfs_item_ptr_offset(eb, slot);
+	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+	if (key->type == BTRFS_INODE_EXTREF_KEY) {
+		struct btrfs_inode_extref *r;
+
+		ref_struct_size = sizeof(struct btrfs_inode_extref);
+		log_ref_ver = 1;
+		r = (struct btrfs_inode_extref *)ref_ptr;
+		parent_objectid = btrfs_inode_extref_parent(eb, r);
+	} else {
+		ref_struct_size = sizeof(struct btrfs_inode_ref);
+		parent_objectid = key->offset;
+	}
+	inode_objectid = key->objectid;
 
 	/*
 	 * it is possible that we didn't log all the parent directories
@@ -904,32 +1058,38 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	 * copy the back ref in.  The link count fixup code will take
 	 * care of the rest
 	 */
-	dir = read_one_inode(root, key->offset);
+	dir = read_one_inode(root, parent_objectid);
 	if (!dir)
 		return -ENOENT;
 
-	inode = read_one_inode(root, key->objectid);
+	inode = read_one_inode(root, inode_objectid);
 	if (!inode) {
 		iput(dir);
 		return -EIO;
 	}
 
-	ref_ptr = btrfs_item_ptr_offset(eb, slot);
-	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
-
 	while (ref_ptr < ref_end) {
-		ref = (struct btrfs_inode_ref *)ref_ptr;
-
-		namelen = btrfs_inode_ref_name_len(eb, ref);
-		name = kmalloc(namelen, GFP_NOFS);
-		BUG_ON(!name);
-
-		read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
+		if (log_ref_ver) {
+			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+						&ref_index, &parent_objectid);
+			/*
+			 * parent object can change from one array
+			 * item to another.
+			 */
+			if (!dir)
+				dir = read_one_inode(root, parent_objectid);
+			if (!dir)
+				return -ENOENT;
+		} else {
+			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+					     &ref_index);
+		}
+		if (ret)
+			return ret;
 
 		/* if we already have a perfect match, we're done */
 		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
-				  btrfs_inode_ref_index(eb, ref),
-				  name, namelen)) {
+				  ref_index, name, namelen)) {
 			/*
 			 * look for a conflicting back reference in the
 			 * metadata. if we find one we have to unlink that name
@@ -940,8 +1100,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 
 			if (!search_done) {
 				ret = __add_inode_ref(trans, root, path, log,
-						      dir, inode, key, eb, ref,
-						      name, namelen,
+						      dir, inode, eb,
+						      inode_objectid,
+						      parent_objectid,
+						      ref_index, name, namelen,
 						      &search_done);
 				if (ret == 1)
 					goto out;
@@ -950,14 +1112,18 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 
 			/* insert our name */
 			ret = btrfs_add_link(trans, dir, inode, name, namelen,
-					     0, btrfs_inode_ref_index(eb, ref));
+					     0, ref_index);
 			BUG_ON(ret);
 
 			btrfs_update_inode(trans, root, inode);
 		}
 
-		ref_ptr = (unsigned long)(ref + 1) + namelen;
+		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
 		kfree(name);
+		if (log_ref_ver) {
+			iput(dir);
+			dir = NULL;
+		}
 	}
 
 	/* finally write the back reference in the inode */
@@ -981,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static int count_inode_extrefs(struct btrfs_root *root,
+			       struct inode *inode, struct btrfs_path *path)
+{
+	int ret = 0;
+	int name_len;
+	unsigned int nlink = 0;
+	u32 item_size;
+	u32 cur_offset = 0;
+	u64 inode_objectid = btrfs_ino(inode);
+	u64 offset = 0;
+	unsigned long ptr;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
 
-/*
- * There are a few corners where the link count of the file can't
- * be properly maintained during replay.  So, instead of adding
- * lots of complexity to the log code, we just scan the backrefs
- * for any file that has been through replay.
- *
- * The scan will update the link count on the inode to reflect the
- * number of back refs found.  If it goes down to zero, the iput
- * will free the inode.
- */
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root,
-					   struct inode *inode)
+	while (1) {
+		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
+					    &extref, &offset);
+		if (ret)
+			break;
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+		while (cur_offset < item_size) {
+			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+			name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+			nlink++;
+
+			cur_offset += name_len + sizeof(*extref);
+		}
+
+		offset++;
+		btrfs_release_path(path);
+	}
+	btrfs_release_path(path);
+
+	if (ret < 0)
+		return ret;
+	return nlink;
+}
+
+static int count_inode_refs(struct btrfs_root *root,
+			       struct inode *inode, struct btrfs_path *path)
 {
-	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
-	u64 nlink = 0;
+	unsigned int nlink = 0;
 	unsigned long ptr;
 	unsigned long ptr_end;
 	int name_len;
@@ -1009,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 	key.type = BTRFS_INODE_REF_KEY;
 	key.offset = (u64)-1;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
@@ -1046,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		btrfs_release_path(path);
 	}
 	btrfs_release_path(path);
+
+	return nlink;
+}
+
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay.  So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found.  If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct inode *inode)
+{
+	struct btrfs_path *path;
+	int ret;
+	u64 nlink = 0;
+	u64 ino = btrfs_ino(inode);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = count_inode_refs(root, inode, path);
+	if (ret < 0)
+		goto out;
+
+	nlink = ret;
+
+	ret = count_inode_extrefs(root, inode, path);
+	if (ret == -ENOENT)
+		ret = 0;
+
+	if (ret < 0)
+		goto out;
+
+	nlink += ret;
+
+	ret = 0;
+
 	if (nlink != inode->i_nlink) {
 		set_nlink(inode, nlink);
 		btrfs_update_inode(trans, root, inode);
@@ -1061,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		ret = insert_orphan_item(trans, root, ino);
 		BUG_ON(ret);
 	}
-	btrfs_free_path(path);
 
-	return 0;
+out:
+	btrfs_free_path(path);
+	return ret;
 }
 
 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
@@ -1710,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			ret = add_inode_ref(wc->trans, root, log, path,
 					    eb, i, &key);
 			BUG_ON(ret && ret != -ENOENT);
+		} else if (key.type == BTRFS_INODE_EXTREF_KEY) {
+			ret = add_inode_ref(wc->trans, root, log, path,
+					    eb, i, &key);
+			BUG_ON(ret && ret != -ENOENT);
 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
 			ret = replay_one_extent(wc->trans, root, path,
 						eb, i, &key);
-- 
cgit v0.10.2


From d24bec3ae528a47149b838aad76c006d40fe8a39 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.de>
Date: Wed, 8 Aug 2012 11:33:54 -0700
Subject: btrfs: extended inode ref iteration

The iterate_irefs in backref.c is used to build path components from inode
refs. This patch adds code to iterate extended refs as well.

I had modify the callback function signature to abstract out some of the
differences between ref structures. iref_to_path() also needed similar
changes.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index dc96312..f318793 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1177,26 +1177,12 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 	return ret;
 }
 
-/*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
- * 0-terminated. the path is only given within the current file system.
- * Therefore, it never starts with a '/'. the caller is responsible to provide
- * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
- * the start point of the resulting string is returned. this pointer is within
- * dest, normally.
- * in case the path buffer would overflow, the pointer is decremented further
- * as if output was written to the buffer, though no more output is actually
- * generated. that way, the caller can determine how much space would be
- * required for the path to fit into the buffer. in that case, the returned
- * value will be smaller than dest. callers must check this!
- */
-char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
-			 struct btrfs_inode_ref *iref,
+static char *ref_to_path(struct btrfs_root *fs_root,
+			 struct btrfs_path *path,
+			 u32 name_len, unsigned long name_off,
 			 struct extent_buffer *eb_in, u64 parent,
 			 char *dest, u32 size)
 {
-	u32 len;
 	int slot;
 	u64 next_inum;
 	int ret;
@@ -1204,17 +1190,17 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 	struct extent_buffer *eb = eb_in;
 	struct btrfs_key found_key;
 	int leave_spinning = path->leave_spinning;
+	struct btrfs_inode_ref *iref;
 
 	if (bytes_left >= 0)
 		dest[bytes_left] = '\0';
 
 	path->leave_spinning = 1;
 	while (1) {
-		len = btrfs_inode_ref_name_len(eb, iref);
-		bytes_left -= len;
+		bytes_left -= name_len;
 		if (bytes_left >= 0)
 			read_extent_buffer(eb, dest + bytes_left,
-						(unsigned long)(iref + 1), len);
+					   name_off, name_len);
 		if (eb != eb_in) {
 			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
@@ -1224,6 +1210,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			ret = -ENOENT;
 		if (ret)
 			break;
+
 		next_inum = found_key.offset;
 
 		/* regular exit ahead */
@@ -1239,8 +1226,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		}
 		btrfs_release_path(path);
-
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+		name_len = btrfs_inode_ref_name_len(eb, iref);
+		name_off = (unsigned long)(iref + 1);
+
 		parent = next_inum;
 		--bytes_left;
 		if (bytes_left >= 0)
@@ -1257,6 +1247,32 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 }
 
 /*
+ * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
+ * of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_iref_to_path(struct btrfs_root *fs_root,
+			 struct btrfs_path *path,
+			 struct btrfs_inode_ref *iref,
+			 struct extent_buffer *eb_in, u64 parent,
+			 char *dest, u32 size)
+{
+	return ref_to_path(fs_root, path,
+			   btrfs_inode_ref_name_len(eb_in, iref),
+			   (unsigned long)(iref + 1),
+			   eb_in, parent, dest, size);
+}
+
+/*
  * this makes the path point to (logical EXTENT_ITEM *)
  * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
  * tree blocks and <0 on error.
@@ -1529,9 +1545,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
-				struct btrfs_path *path,
-				iterate_irefs_t *iterate, void *ctx)
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
+			      struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+			      struct btrfs_path *path,
+			      iterate_irefs_t *iterate, void *ctx)
 {
 	int ret = 0;
 	int slot;
@@ -1548,7 +1567,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	while (!ret) {
 		path->leave_spinning = 1;
 		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
-					&found_key);
+				     &found_key);
 		if (ret < 0)
 			break;
 		if (ret) {
@@ -1576,7 +1595,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 				 "tree %llu\n", cur,
 				 (unsigned long long)found_key.objectid,
 				 (unsigned long long)fs_root->objectid);
-			ret = iterate(parent, iref, eb, ctx);
+			ret = iterate(parent, name_len,
+				      (unsigned long)(iref + 1), eb, ctx);
 			if (ret)
 				break;
 			len = sizeof(*iref) + name_len;
@@ -1591,12 +1611,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	return ret;
 }
 
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+				 struct btrfs_path *path,
+				 iterate_irefs_t *iterate, void *ctx)
+{
+	int ret;
+	int slot;
+	u64 offset = 0;
+	u64 parent;
+	int found = 0;
+	struct extent_buffer *eb;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	u32 item_size;
+	u32 cur_offset;
+	unsigned long ptr;
+
+	while (1) {
+		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+					    &offset);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = found ? 0 : -ENOENT;
+			break;
+		}
+		++found;
+
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		/* make sure we can use eb after releasing the path */
+		atomic_inc(&eb->refs);
+
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		btrfs_release_path(path);
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		cur_offset = 0;
+
+		while (cur_offset < item_size) {
+			u32 name_len;
+
+			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+			parent = btrfs_inode_extref_parent(eb, extref);
+			name_len = btrfs_inode_extref_name_len(eb, extref);
+			ret = iterate(parent, name_len,
+				      (unsigned long)&extref->name, eb, ctx);
+			if (ret)
+				break;
+
+			cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+			cur_offset += sizeof(*extref);
+		}
+		btrfs_tree_read_unlock_blocking(eb);
+		free_extent_buffer(eb);
+
+		offset++;
+	}
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+			 struct btrfs_path *path, iterate_irefs_t *iterate,
+			 void *ctx)
+{
+	int ret;
+	int found_refs = 0;
+
+	ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+	if (!ret)
+		++found_refs;
+	else if (ret != -ENOENT)
+		return ret;
+
+	ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+	if (ret == -ENOENT && found_refs)
+		return 0;
+
+	return ret;
+}
+
 /*
  * returns 0 if the path could be dumped (probably truncated)
  * returns <0 in case of an error
  */
-static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
-				struct extent_buffer *eb, void *ctx)
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+			 struct extent_buffer *eb, void *ctx)
 {
 	struct inode_fs_paths *ipath = ctx;
 	char *fspath;
@@ -1609,20 +1715,17 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 					ipath->fspath->bytes_left - s_ptr : 0;
 
 	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
-	fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
-				inum, fspath_min, bytes_left);
+	fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+			     name_off, eb, inum, fspath_min,
+			     bytes_left);
 	if (IS_ERR(fspath))
 		return PTR_ERR(fspath);
 
 	if (fspath > fspath_min) {
-		pr_debug("path resolved: %s\n", fspath);
 		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
 		++ipath->fspath->elem_cnt;
 		ipath->fspath->bytes_left = fspath - fspath_min;
 	} else {
-		pr_debug("missed path, not enough space. missing bytes: %lu, "
-			 "constructed so far: %s\n",
-			 (unsigned long)(fspath_min - fspath), fspath_min);
 		++ipath->fspath->elem_missed;
 		ipath->fspath->bytes_missing += fspath_min - fspath;
 		ipath->fspath->bytes_left = 0;
@@ -1644,7 +1747,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
 {
 	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
-				inode_to_path, ipath);
+			     inode_to_path, ipath);
 }
 
 struct btrfs_data_container *init_data_container(u32 total_bytes)
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0b920c1..e755330 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -33,8 +33,6 @@ struct inode_fs_paths {
 
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
 		void *ctx);
-typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
-				struct extent_buffer *eb, void *ctx);
 
 int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
 			struct btrfs_path *path);
-- 
cgit v0.10.2


From e8830e606ffee383f073e32313f11fc5692813fe Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 19 Sep 2012 22:14:29 -0600
Subject: Btrfs: fix memory leak in start_transaction()

This patch fixes memory leak of the transaction handle which happened
when starting transaction failed on a freezed fs.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 910ff80..3548964 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -351,8 +351,10 @@ again:
 	 */
 	if (type != TRANS_JOIN_NOLOCK &&
 	    !__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) {
-		if (type == TRANS_JOIN_FREEZE)
+		if (type == TRANS_JOIN_FREEZE) {
+			kmem_cache_free(btrfs_trans_handle_cachep, h);
 			return ERR_PTR(-EPERM);
+		}
 		sb_start_intwrite(root->fs_info->sb);
 	}
 
-- 
cgit v0.10.2


From a698d0755adb6f27289d1e6610b2240595d27e8c Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 20 Sep 2012 01:51:59 -0600
Subject: Btrfs: add a type field for the transaction handle

This patch add a type field into the transaction handle structure,
in this way, we needn't implement various end-transaction functions
and can make the code more simple and readable.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 596305e..e355eb0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1363,11 +1363,7 @@ out_check:
 	}
 
 error:
-	if (nolock) {
-		err = btrfs_end_transaction_nolock(trans, root);
-	} else {
-		err = btrfs_end_transaction(trans, root);
-	}
+	err = btrfs_end_transaction(trans, root);
 	if (!ret)
 		ret = err;
 
@@ -1957,12 +1953,8 @@ out_unlock:
 out:
 	if (root != root->fs_info->tree_root)
 		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
-	if (trans) {
-		if (nolock)
-			btrfs_end_transaction_nolock(trans, root);
-		else
-			btrfs_end_transaction(trans, root);
-	}
+	if (trans)
+		btrfs_end_transaction(trans, root);
 
 	if (ret)
 		clear_extent_uptodate(io_tree, ordered_extent->file_offset,
@@ -4524,10 +4516,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 			trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
-		if (nolock)
-			ret = btrfs_end_transaction_nolock(trans, root);
-		else
-			ret = btrfs_commit_transaction(trans, root);
+		ret = btrfs_commit_transaction(trans, root);
 	}
 	return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3548964..d0a2b7e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -267,14 +267,6 @@ static void wait_current_trans(struct btrfs_root *root)
 	}
 }
 
-enum btrfs_trans_type {
-	TRANS_START,
-	TRANS_JOIN,
-	TRANS_USERSPACE,
-	TRANS_JOIN_NOLOCK,
-	TRANS_JOIN_FREEZE,
-};
-
 static int may_wait_transaction(struct btrfs_root *root, int type)
 {
 	if (root->fs_info->log_root_recovering)
@@ -388,6 +380,7 @@ again:
 	h->aborted = 0;
 	h->qgroup_reserved = qgroup_reserved;
 	h->delayed_ref_elem.seq = 0;
+	h->type = type;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
 
@@ -540,11 +533,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 }
 
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, int throttle, int lock)
+			  struct btrfs_root *root, int throttle)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_fs_info *info = root->fs_info;
 	int count = 0;
+	int lock = (trans->type != TRANS_JOIN_NOLOCK);
 	int err = 0;
 
 	if (--trans->use_count) {
@@ -645,7 +639,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	ret = __btrfs_end_transaction(trans, root, 0, 1);
+	ret = __btrfs_end_transaction(trans, root, 0);
 	if (ret)
 		return ret;
 	return 0;
@@ -656,18 +650,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	ret = __btrfs_end_transaction(trans, root, 1, 1);
-	if (ret)
-		return ret;
-	return 0;
-}
-
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root)
-{
-	int ret;
-
-	ret = __btrfs_end_transaction(trans, root, 0, 0);
+	ret = __btrfs_end_transaction(trans, root, 1);
 	if (ret)
 		return ret;
 	return 0;
@@ -676,7 +659,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root)
 {
-	return __btrfs_end_transaction(trans, root, 1, 1);
+	return __btrfs_end_transaction(trans, root, 1);
 }
 
 /*
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index fbf8313..0630bd1 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,14 @@ struct btrfs_transaction {
 	int aborted;
 };
 
+enum btrfs_trans_type {
+	TRANS_START,
+	TRANS_JOIN,
+	TRANS_USERSPACE,
+	TRANS_JOIN_NOLOCK,
+	TRANS_JOIN_FREEZE,
+};
+
 struct btrfs_trans_handle {
 	u64 transid;
 	u64 bytes_reserved;
@@ -58,8 +66,9 @@ struct btrfs_trans_handle {
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_rsv *orig_rsv;
-	int aborted;
-	int adding_csums;
+	short aborted;
+	short adding_csums;
+	enum btrfs_trans_type type;
 	/*
 	 * this root is only needed to validate that the root passed to
 	 * start_transaction is the same as the one passed to end_transaction.
@@ -94,8 +103,6 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_items);
 struct btrfs_trans_handle *btrfs_start_transaction_noflush(
-- 
cgit v0.10.2


From 354aa0fb6d5b97b262e056f7ad7bfc88d7ce0004 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 20 Sep 2012 01:54:00 -0600
Subject: Btrfs: fix orphan transaction on the freezed filesystem

With the following debug patch:

 static int btrfs_freeze(struct super_block *sb)
 {
+ 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_transaction *trans;
+
+	spin_lock(&fs_info->trans_lock);
+	trans = fs_info->running_transaction;
+	if (trans) {
+		printk("Transid %llu, use_count %d, num_writer %d\n",
+			trans->transid, atomic_read(&trans->use_count),
+			atomic_read(&trans->num_writers));
+	}
+	spin_unlock(&fs_info->trans_lock);
 	return 0;
 }

I found there was a orphan transaction after the freeze operation was done.

It is because the transaction may not be committed when the transaction handle
end even though it is the last handle of the current transaction. This design
avoid committing the transaction frequently, but also introduce the above
problem.

So I add btrfs_attach_transaction() which can catch the current transaction
and commit it. If there is no transaction, it will return ENOENT, and do not
anything.

This function also can be used to instead of btrfs_join_transaction_freeze()
because it don't increase the writer counter and don't start a new transaction,
so it also can fix the deadlock between sync and freeze.

Besides that, it is used to instead of btrfs_join_transaction() in
transaction_kthread(), because if there is no transaction, the transaction
kthread needn't anything.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2c18a4e..dcaf556 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1657,9 +1657,10 @@ static int transaction_kthread(void *arg)
 		spin_unlock(&root->fs_info->trans_lock);
 
 		/* If the file system is aborted, this will always fail. */
-		trans = btrfs_join_transaction(root);
+		trans = btrfs_attach_transaction(root);
 		if (IS_ERR(trans)) {
-			cannot_commit = true;
+			if (PTR_ERR(trans) != -ENOENT)
+				cannot_commit = true;
 			goto sleep;
 		}
 		if (transid == trans->transid) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index fb260bb..f8b803f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -854,10 +854,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
 	btrfs_wait_ordered_extents(root, 0);
 
-	trans = btrfs_join_transaction_freeze(root);
+	trans = btrfs_attach_transaction(root);
 	if (IS_ERR(trans)) {
-		/* Frozen, don't bother */
-		if (PTR_ERR(trans) == -EPERM)
+		/* no transaction, don't bother */
+		if (PTR_ERR(trans) == -ENOENT)
 			return 0;
 		return PTR_ERR(trans);
 	}
@@ -1511,7 +1511,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 
 static int btrfs_freeze(struct super_block *sb)
 {
-	return 0;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+
+	trans = btrfs_attach_transaction(root);
+	if (IS_ERR(trans)) {
+		/* no transaction, don't bother */
+		if (PTR_ERR(trans) == -ENOENT)
+			return 0;
+		return PTR_ERR(trans);
+	}
+	return btrfs_commit_transaction(trans, root);
 }
 
 static int btrfs_unfreeze(struct super_block *sb)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d0a2b7e..69139a3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 /*
  * either allocate a new transaction or hop into the existing one
  */
-static noinline int join_transaction(struct btrfs_root *root, int nofail)
+static noinline int join_transaction(struct btrfs_root *root, int type)
 {
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -67,7 +67,13 @@ loop:
 	}
 
 	if (fs_info->trans_no_join) {
-		if (!nofail) {
+		/* 
+		 * If we are JOIN_NOLOCK we're already committing a current
+		 * transaction, we just need a handle to deal with something
+		 * when committing the transaction, such as inode cache and
+		 * space cache. It is a special case.
+		 */
+		if (type != TRANS_JOIN_NOLOCK) {
 			spin_unlock(&fs_info->trans_lock);
 			return -EBUSY;
 		}
@@ -87,6 +93,13 @@ loop:
 	}
 	spin_unlock(&fs_info->trans_lock);
 
+	/*
+	 * If we are ATTACH, we just want to catch the current transaction,
+	 * and commit it. If there is no transaction, just return ENOENT.
+	 */
+	if (type == TRANS_ATTACH)
+		return -ENOENT;
+
 	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
 	if (!cur_trans)
 		return -ENOMEM;
@@ -340,27 +353,28 @@ again:
 	 * because we're already holding a ref.  We need this because we could
 	 * have raced in and did an fsync() on a file which can kick a commit
 	 * and then we deadlock with somebody doing a freeze.
+	 *
+	 * If we are ATTACH, it means we just want to catch the current
+	 * transaction and commit it, so we needn't do sb_start_intwrite(). 
 	 */
-	if (type != TRANS_JOIN_NOLOCK &&
-	    !__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) {
-		if (type == TRANS_JOIN_FREEZE) {
-			kmem_cache_free(btrfs_trans_handle_cachep, h);
-			return ERR_PTR(-EPERM);
-		}
+	if (type < TRANS_JOIN_NOLOCK)
 		sb_start_intwrite(root->fs_info->sb);
-	}
 
 	if (may_wait_transaction(root, type))
 		wait_current_trans(root);
 
 	do {
-		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
+		ret = join_transaction(root, type);
 		if (ret == -EBUSY)
 			wait_current_trans(root);
 	} while (ret == -EBUSY);
 
 	if (ret < 0) {
-		sb_end_intwrite(root->fs_info->sb);
+		/* We must get the transaction if we are JOIN_NOLOCK. */
+		BUG_ON(type == TRANS_JOIN_NOLOCK);
+
+		if (type < TRANS_JOIN_NOLOCK)
+			sb_end_intwrite(root->fs_info->sb);
 		kmem_cache_free(btrfs_trans_handle_cachep, h);
 		return ERR_PTR(ret);
 	}
@@ -432,9 +446,9 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
 	return start_transaction(root, 0, TRANS_USERSPACE, 0);
 }
 
-struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root)
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_JOIN_FREEZE, 0);
+	return start_transaction(root, 0, TRANS_ATTACH, 0);
 }
 
 /* wait for a transaction commit to be fully complete */
@@ -605,7 +619,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	if (lock)
+	if (trans->type < TRANS_JOIN_NOLOCK)
 		sb_end_intwrite(root->fs_info->sb);
 
 	WARN_ON(cur_trans != info->running_transaction);
@@ -1678,7 +1692,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
-	sb_end_intwrite(root->fs_info->sb);
+	if (trans->type < TRANS_JOIN_NOLOCK)
+		sb_end_intwrite(root->fs_info->sb);
 
 	trace_btrfs_transaction_commit(root);
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0630bd1..8096194 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -52,7 +52,7 @@ enum btrfs_trans_type {
 	TRANS_JOIN,
 	TRANS_USERSPACE,
 	TRANS_JOIN_NOLOCK,
-	TRANS_JOIN_FREEZE,
+	TRANS_ATTACH,
 };
 
 struct btrfs_trans_handle {
@@ -109,7 +109,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_noflush(
 					struct btrfs_root *root, int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
-struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
-- 
cgit v0.10.2


From 67b0fd63d5466bb710677afd3d2b77926c55f662 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 24 Sep 2012 13:42:00 -0400
Subject: Btrfs: run delayed refs first when out of space

Running delayed refs is faster than running delalloc, so lets do that first
to try and reclaim space.  This makes my fs_mark test about 20% faster.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 395e222..3270b10 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3817,10 +3817,10 @@ commit:
 }
 
 enum flush_state {
-	FLUSH_DELALLOC		=	1,
-	FLUSH_DELALLOC_WAIT	=	2,
-	FLUSH_DELAYED_ITEMS_NR	=	3,
-	FLUSH_DELAYED_ITEMS	=	4,
+	FLUSH_DELAYED_ITEMS_NR	=	1,
+	FLUSH_DELAYED_ITEMS	=	2,
+	FLUSH_DELALLOC		=	3,
+	FLUSH_DELALLOC_WAIT	=	4,
 	ALLOC_CHUNK		=	5,
 	COMMIT_TRANS		=	6,
 };
@@ -3834,11 +3834,6 @@ static int flush_space(struct btrfs_root *root,
 	int ret = 0;
 
 	switch (state) {
-	case FLUSH_DELALLOC:
-	case FLUSH_DELALLOC_WAIT:
-		shrink_delalloc(root, num_bytes, orig_bytes,
-				state == FLUSH_DELALLOC_WAIT);
-		break;
 	case FLUSH_DELAYED_ITEMS_NR:
 	case FLUSH_DELAYED_ITEMS:
 		if (state == FLUSH_DELAYED_ITEMS_NR) {
@@ -3859,6 +3854,11 @@ static int flush_space(struct btrfs_root *root,
 		ret = btrfs_run_delayed_items_nr(trans, root, nr);
 		btrfs_end_transaction(trans, root);
 		break;
+	case FLUSH_DELALLOC:
+	case FLUSH_DELALLOC_WAIT:
+		shrink_delalloc(root, num_bytes, orig_bytes,
+				state == FLUSH_DELALLOC_WAIT);
+		break;
 	case ALLOC_CHUNK:
 		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans)) {
@@ -3903,7 +3903,7 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 	struct btrfs_space_info *space_info = block_rsv->space_info;
 	u64 used;
 	u64 num_bytes = orig_bytes;
-	int flush_state = FLUSH_DELALLOC;
+	int flush_state = FLUSH_DELAYED_ITEMS_NR;
 	int ret = 0;
 	bool flushing = false;
 	bool committed = false;
-- 
cgit v0.10.2


From 221b831835421f9451182611fa25fa60f440662f Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@zabbo.net>
Date: Thu, 20 Sep 2012 14:33:00 -0600
Subject: btrfs: fix min csum item size warnings in 32bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 7ca4be45a0255ac8f08c05491c6add2dd87dd4f8 limited csum items to
PAGE_CACHE_SIZE.  It used min() with incompatible types in 32bit which
generates warnings:

fs/btrfs/file-item.c: In function ‘btrfs_csum_file_blocks’:
fs/btrfs/file-item.c:717: warning: comparison of distinct pointer types lacks a cast

This uses min_t(u32,) to fix the warnings.  u32 seemed reasonable
because btrfs_root->leafsize is u32 and PAGE_CACHE_SIZE is unsigned
long.

Signed-off-by: Zach Brown <zab@zabbo.net>

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54f0335..1ad08e4e4 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -29,7 +29,8 @@
 				   sizeof(struct btrfs_item) * 2) / \
 				  size) - 1))
 
-#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE))
+#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
+				       PAGE_CACHE_SIZE))
 
 #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
 				   sizeof(struct btrfs_ordered_sum)) / \
-- 
cgit v0.10.2


From de0022b9da616b95ea5b41eab32da825b0b5150f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 25 Sep 2012 14:25:58 -0400
Subject: Btrfs: do not async metadata csumming in certain situations

There are a coule scenarios where farming metadata csumming off to an async
thread doesn't help.  The first is if our processor supports crc32c, in
which case the csumming will be fast and so the overhead of the async model
is not worth the cost.  The other case is for our tree log.  We will be
making that stuff dirty and writing it out and waiting for it immediately.
Even with software crc32c this gives me a ~15% increase in speed with O_SYNC
workloads.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dcaf556..aa02eab 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,10 @@
 #include "check-integrity.h"
 #include "rcu-string.h"
 
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
@@ -859,10 +863,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
 }
 
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+	if (bio_flags & EXTENT_BIO_TREE_LOG)
+		return 0;
+#ifdef CONFIG_X86
+	if (cpu_has_xmm4_2)
+		return 0;
+#endif
+	return 1;
+}
+
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	int async = check_async_write(inode, bio_flags);
 	int ret;
 
 	if (!(rw & REQ_WRITE)) {
@@ -877,6 +893,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			return ret;
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 				     mirror_num, 0);
+	} else if (!async) {
+		ret = btree_csum_one_bio(bio);
+		if (ret)
+			return ret;
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				     mirror_num, 0);
 	}
 
 	/*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a2c2157..979fa0d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -45,6 +45,7 @@ struct extent_page_data {
 	struct bio *bio;
 	struct extent_io_tree *tree;
 	get_extent_t *get_extent;
+	unsigned long bio_flags;
 
 	/* tells writepage not to lock the state bits for this range
 	 * it still does the unlocking
@@ -3163,12 +3164,16 @@ static int write_one_eb(struct extent_buffer *eb,
 	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
 	u64 offset = eb->start;
 	unsigned long i, num_pages;
+	unsigned long bio_flags = 0;
 	int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
 	int ret = 0;
 
 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
 	atomic_set(&eb->io_pages, num_pages);
+	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+		bio_flags = EXTENT_BIO_TREE_LOG;
+
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = extent_buffer_page(eb, i);
 
@@ -3177,7 +3182,8 @@ static int write_one_eb(struct extent_buffer *eb,
 		ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
 					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
 					 -1, end_bio_extent_buffer_writepage,
-					 0, 0, 0);
+					 0, epd->bio_flags, bio_flags);
+		epd->bio_flags = bio_flags;
 		if (ret) {
 			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 			SetPageError(p);
@@ -3212,6 +3218,7 @@ int btree_write_cache_pages(struct address_space *mapping,
 		.tree = tree,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 	int ret = 0;
 	int done = 0;
@@ -3474,7 +3481,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
 		if (epd->sync_io)
 			rw = WRITE_SYNC;
 
-		ret = submit_one_bio(rw, epd->bio, 0, 0);
+		ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
 		BUG_ON(ret < 0); /* -ENOMEM */
 		epd->bio = NULL;
 	}
@@ -3497,6 +3504,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.get_extent = get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 
 	ret = __extent_writepage(page, wbc, &epd);
@@ -3521,6 +3529,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.get_extent = get_extent,
 		.extent_locked = 1,
 		.sync_io = mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 	struct writeback_control wbc_writepages = {
 		.sync_mode	= mode,
@@ -3560,6 +3569,7 @@ int extent_writepages(struct extent_io_tree *tree,
 		.get_extent = get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 
 	ret = extent_write_cache_pages(tree, mapping, wbc,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 512f8da..a69dea2 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,6 +27,7 @@
  * type for this bio
  */
 #define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_TREE_LOG 2
 #define EXTENT_BIO_FLAG_SHIFT 16
 
 /* these are bit numbers for test/set bit */
-- 
cgit v0.10.2


From ce1953325662fa597197ce728e4195582fc21c8d Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 25 Sep 2012 15:26:16 -0400
Subject: Btrfs: do not hold the file extent leaf locked when adding extent
 item

For some reason we unlock everything except the leaf we are on, set the path
blocking and then add the extent item for the extent we just finished
writing.  I can't for the life of me figure out why we would want to do
this, and the history doesn't really indicate that there was a real reason
for it, so just remove it.  This will reduce our tree lock contention on
heavy writes.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e355eb0..d24b650 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1822,10 +1822,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
 
-	btrfs_unlock_up_safe(path, 1);
-	btrfs_set_lock_blocking(leaf);
-
 	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
 
 	inode_add_bytes(inode, num_bytes);
 
-- 
cgit v0.10.2


From e6138876ad8327250d77291b3262fee356267211 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 27 Sep 2012 17:07:30 -0400
Subject: Btrfs: cache extent state when writing out dirty metadata pages

Everytime we write out dirty pages we search for an offset in the tree,
convert the bits in the state, and then when we wait we search for the
offset again and clear the bits.  So for every dirty range in the io tree we
are doing 4 rb searches, which is suboptimal.  With this patch we are only
doing 2 searches for every cycle (modulo weird things happening).  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index aa02eab..c699955 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3572,7 +3572,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 
 	while (1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-					    mark);
+					    mark, NULL);
 		if (ret)
 			break;
 
@@ -3627,7 +3627,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
 again:
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret)
 			break;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3270b10..ca4aad9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 	while (start < end) {
 		ret = find_first_extent_bit(info->pinned_extents, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY | EXTENT_UPTODATE);
+					    EXTENT_DIRTY | EXTENT_UPTODATE,
+					    NULL);
 		if (ret)
 			break;
 
@@ -5045,7 +5046,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret)
 			break;
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 979fa0d..e8ee39b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -937,6 +937,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
  * @end:	the end offset in bytes (inclusive)
  * @bits:	the bits to set in this range
  * @clear_bits:	the bits to clear in this range
+ * @cached_state:	state that we're going to cache
  * @mask:	the allocation mask
  *
  * This will go through and set bits for the given range.  If any states exist
@@ -946,7 +947,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
  * boundary bits like LOCK.
  */
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		       int bits, int clear_bits, gfp_t mask)
+		       int bits, int clear_bits,
+		       struct extent_state **cached_state, gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -963,6 +965,15 @@ again:
 	}
 
 	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->start <= start && state->end > start &&
+		    state->tree) {
+			node = &state->rb_node;
+			goto hit_next;
+		}
+	}
+
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -993,6 +1004,7 @@ hit_next:
 	 */
 	if (state->start == start && state->end <= end) {
 		set_state_bits(tree, state, &bits);
+		cache_state(state, cached_state);
 		state = clear_state_bit(tree, state, &clear_bits, 0);
 		if (last_end == (u64)-1)
 			goto out;
@@ -1033,6 +1045,7 @@ hit_next:
 			goto out;
 		if (state->end <= end) {
 			set_state_bits(tree, state, &bits);
+			cache_state(state, cached_state);
 			state = clear_state_bit(tree, state, &clear_bits, 0);
 			if (last_end == (u64)-1)
 				goto out;
@@ -1071,6 +1084,7 @@ hit_next:
 				   &bits);
 		if (err)
 			extent_io_tree_panic(tree, err);
+		cache_state(prealloc, cached_state);
 		prealloc = NULL;
 		start = this_end + 1;
 		goto search_again;
@@ -1093,6 +1107,7 @@ hit_next:
 			extent_io_tree_panic(tree, err);
 
 		set_state_bits(tree, prealloc, &bits);
+		cache_state(prealloc, cached_state);
 		clear_state_bit(tree, prealloc, &clear_bits, 0);
 		prealloc = NULL;
 		goto out;
@@ -1297,18 +1312,42 @@ out:
  * If nothing was found, 1 is returned. If found something, return 0.
  */
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits)
+			  u64 *start_ret, u64 *end_ret, int bits,
+			  struct extent_state **cached_state)
 {
 	struct extent_state *state;
+	struct rb_node *n;
 	int ret = 1;
 
 	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->end == start - 1 && state->tree) {
+			n = rb_next(&state->rb_node);
+			while (n) {
+				state = rb_entry(n, struct extent_state,
+						 rb_node);
+				if (state->state & bits)
+					goto got_it;
+				n = rb_next(n);
+			}
+			free_extent_state(*cached_state);
+			*cached_state = NULL;
+			goto out;
+		}
+		free_extent_state(*cached_state);
+		*cached_state = NULL;
+	}
+
 	state = find_first_extent_bit_state(tree, start, bits);
+got_it:
 	if (state) {
+		cache_state(state, cached_state);
 		*start_ret = state->start;
 		*end_ret = state->end;
 		ret = 0;
 	}
+out:
 	spin_unlock(&tree->lock);
 	return ret;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a69dea2..7aeb310 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -233,13 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		       int bits, int clear_bits, gfp_t mask);
+		       int bits, int clear_bits,
+		       struct extent_state **cached_state, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask);
 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
 		      struct extent_state **cached_state, gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits);
+			  u64 *start_ret, u64 *end_ret, int bits,
+			  struct extent_state **cached_state);
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
 						 u64 start, int bits);
 int extent_invalidatepage(struct extent_io_tree *tree,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b107e68..1027b85 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 			       block_group->key.offset)) {
 		ret = find_first_extent_bit(unpin, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret) {
 			ret = 0;
 			break;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 6e530bb..776f0aa 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3621,7 +3621,7 @@ next:
 
 		ret = find_first_extent_bit(&rc->processed_blocks,
 					    key.objectid, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 
 		if (ret == 0 && start <= key.objectid) {
 			btrfs_release_path(path);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 69139a3..77db875 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -687,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 	int err = 0;
 	int werr = 0;
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
 
 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-				      mark)) {
-		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
-				   GFP_NOFS);
+				      mark, &cached_state)) {
+		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+				   mark, &cached_state, GFP_NOFS);
+		cached_state = NULL;
 		err = filemap_fdatawrite_range(mapping, start, end);
 		if (err)
 			werr = err;
@@ -717,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 	int err = 0;
 	int werr = 0;
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
 
 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-				      EXTENT_NEED_WAIT)) {
-		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
+				      EXTENT_NEED_WAIT, &cached_state)) {
+		clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+				 0, 0, &cached_state, GFP_NOFS);
 		err = filemap_fdatawait_range(mapping, start, end);
 		if (err)
 			werr = err;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1d7b348..f4b9e54 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2463,7 +2463,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,
-				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
+				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
+				NULL);
 		if (ret)
 			break;
 
-- 
cgit v0.10.2


From 6f1fed775316d58ba356b2ce62de600ad00f003a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 26 Sep 2012 11:07:06 -0400
Subject: Btrfs: don't lookup csums for prealloc extents

The tree logging stuff was looking up csums to copy over for prealloc
extents which is just work we don't need to be doing.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f4b9e54..31b46a9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3011,8 +3011,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 				continue;
 
 			found_type = btrfs_file_extent_type(src, extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG ||
-			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
 				u64 ds, dl, cs, cl;
 				ds = btrfs_file_extent_disk_bytenr(src,
 								extent);
-- 
cgit v0.10.2


From 18ec90d63f43cf785f6e73e7a7db546bff3fa380 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 28 Sep 2012 11:56:28 -0400
Subject: Btrfs: be smarter about dropping things from the tree log

When we truncate existing items in the tree log we've been searching for
each individual item and removing them.  This is unnecessary churn and
searching, just keep track of the slot we are on and how many items we need
to delete and delete them all at once.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 31b46a9..179fda9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2901,6 +2901,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
+	int start_slot;
 
 	key.objectid = objectid;
 	key.type = max_key_type;
@@ -2922,8 +2923,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 		if (found_key.objectid != objectid)
 			break;
 
-		ret = btrfs_del_item(trans, log, path);
-		if (ret)
+		found_key.offset = 0;
+		found_key.type = 0;
+		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
+				       &start_slot);
+
+		ret = btrfs_del_items(trans, log, path, start_slot,
+				      path->slots[0] - start_slot + 1);
+		/*
+		 * If start slot isn't 0 then we don't need to re-search, we've
+		 * found the last guy with the objectid in this tree.
+		 */
+		if (ret || start_slot != 0)
 			break;
 		btrfs_release_path(path);
 	}
-- 
cgit v0.10.2


From f0bd95ea72644cbf9210606f6f8edfce13d65d47 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Mon, 1 Oct 2012 03:08:37 -0600
Subject: Btrfs: confirmation of value is added before trace_btrfs_get_extent()
 is called

We should confirm the value of extent_map before calling
trace_btrfs_get_extent() because the value of extent_map has the
possibility of NULL.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d24b650..f92def2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5505,7 +5505,8 @@ insert:
 	write_unlock(&em_tree->lock);
 out:
 
-	trace_btrfs_get_extent(root, em);
+	if (em)
+		trace_btrfs_get_extent(root, em);
 
 	if (path)
 		btrfs_free_path(path);
-- 
cgit v0.10.2


From 44734ed1ca0c2e66479cedbf773627a9b5a31364 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 28 Sep 2012 16:04:19 -0400
Subject: Btrfs: don't commit instead of overcommitting

I don't think we have the same problem that this was supposed to fix
originally since we can allocate chunks in the enospc path now.  This code
is causing us to constantly commit the transaction as we get close to using
all of our available space in our currently allocated chunks, instead of
allocating another chunk and carrying on with life, which is not nice for
performance.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ca4aad9..3d3e2c1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3907,7 +3907,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 	int flush_state = FLUSH_DELAYED_ITEMS_NR;
 	int ret = 0;
 	bool flushing = false;
-	bool committed = false;
 
 again:
 	ret = 0;
@@ -3970,33 +3969,12 @@ again:
 			(orig_bytes * 2);
 	}
 
-	if (ret) {
-		u64 avail;
-
-		/*
-		 * If we have a lot of space that's pinned, don't bother doing
-		 * the overcommit dance yet and just commit the transaction.
-		 */
-		avail = (space_info->total_bytes - space_info->bytes_used) * 8;
-		do_div(avail, 10);
-		if (space_info->bytes_pinned >= avail && flush && !committed) {
-			space_info->flush = 1;
-			flushing = true;
-			spin_unlock(&space_info->lock);
-			ret = may_commit_transaction(root, space_info,
-						     orig_bytes, 1);
-			if (ret)
-				goto out;
-			committed = true;
-			goto again;
-		}
-
-		if (can_overcommit(root, space_info, orig_bytes, flush)) {
-			space_info->bytes_may_use += orig_bytes;
-			trace_btrfs_space_reservation(root->fs_info,
-				"space_info", space_info->flags, orig_bytes, 1);
-			ret = 0;
-		}
+	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+		space_info->bytes_may_use += orig_bytes;
+		trace_btrfs_space_reservation(root->fs_info, "space_info",
+					      space_info->flags, orig_bytes,
+					      1);
+		ret = 0;
 	}
 
 	/*
-- 
cgit v0.10.2


From 8d1a1317af439cc7f349907b79198d35feffe179 Mon Sep 17 00:00:00 2001
From: Robin Dong <sanbai@taobao.com>
Date: Sat, 29 Sep 2012 02:07:46 -0600
Subject: btrfs: remove unused function btrfs_insert_some_items()

The function btrfs_insert_some_items() would not be called by any other functions,
so remove it.

Signed-off-by: Robin Dong <sanbai@taobao.com>

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fcc8c21..b334362 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4402,149 +4402,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
 }
 
 /*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
- * Returns the number of keys that were inserted.
- */
-int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct btrfs_path *path,
-			    struct btrfs_key *cpu_key, u32 *data_size,
-			    int nr)
-{
-	struct extent_buffer *leaf;
-	struct btrfs_item *item;
-	int ret = 0;
-	int slot;
-	int i;
-	u32 nritems;
-	u32 total_data = 0;
-	u32 total_size = 0;
-	unsigned int data_end;
-	struct btrfs_disk_key disk_key;
-	struct btrfs_key found_key;
-	struct btrfs_map_token token;
-
-	btrfs_init_map_token(&token);
-
-	for (i = 0; i < nr; i++) {
-		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
-		    BTRFS_LEAF_DATA_SIZE(root)) {
-			break;
-			nr = i;
-		}
-		total_data += data_size[i];
-		total_size += data_size[i] + sizeof(struct btrfs_item);
-	}
-	BUG_ON(nr == 0);
-
-	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
-	if (ret == 0)
-		return -EEXIST;
-	if (ret < 0)
-		goto out;
-
-	leaf = path->nodes[0];
-
-	nritems = btrfs_header_nritems(leaf);
-	data_end = leaf_data_end(root, leaf);
-
-	if (btrfs_leaf_free_space(root, leaf) < total_size) {
-		for (i = nr; i >= 0; i--) {
-			total_data -= data_size[i];
-			total_size -= data_size[i] + sizeof(struct btrfs_item);
-			if (total_size < btrfs_leaf_free_space(root, leaf))
-				break;
-		}
-		nr = i;
-	}
-
-	slot = path->slots[0];
-	BUG_ON(slot < 0);
-
-	if (slot != nritems) {
-		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
-
-		item = btrfs_item_nr(leaf, slot);
-		btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
-		/* figure out how many keys we can insert in here */
-		total_data = data_size[0];
-		for (i = 1; i < nr; i++) {
-			if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
-				break;
-			total_data += data_size[i];
-		}
-		nr = i;
-
-		if (old_data < data_end) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
-			       slot, old_data, data_end);
-			BUG_ON(1);
-		}
-		/*
-		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
-		 */
-		/* first correct the data pointers */
-		for (i = slot; i < nritems; i++) {
-			u32 ioff;
-
-			item = btrfs_item_nr(leaf, i);
-			ioff = btrfs_token_item_offset(leaf, item, &token);
-			btrfs_set_token_item_offset(leaf, item,
-						    ioff - total_data, &token);
-		}
-		/* shift the items */
-		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
-			      btrfs_item_nr_offset(slot),
-			      (nritems - slot) * sizeof(struct btrfs_item));
-
-		/* shift the data */
-		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
-			      data_end - total_data, btrfs_leaf_data(leaf) +
-			      data_end, old_data - data_end);
-		data_end = old_data;
-	} else {
-		/*
-		 * this sucks but it has to be done, if we are inserting at
-		 * the end of the leaf only insert 1 of the items, since we
-		 * have no way of knowing whats on the next leaf and we'd have
-		 * to drop our current locks to figure it out
-		 */
-		nr = 1;
-	}
-
-	/* setup the item for the new data */
-	for (i = 0; i < nr; i++) {
-		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
-		btrfs_set_item_key(leaf, &disk_key, slot + i);
-		item = btrfs_item_nr(leaf, slot + i);
-		btrfs_set_token_item_offset(leaf, item,
-					    data_end - data_size[i], &token);
-		data_end -= data_size[i];
-		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
-	}
-	btrfs_set_header_nritems(leaf, nritems + nr);
-	btrfs_mark_buffer_dirty(leaf);
-
-	ret = 0;
-	if (slot == 0) {
-		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
-		fixup_low_keys(trans, root, path, &disk_key, 1);
-	}
-
-	if (btrfs_leaf_free_space(root, leaf) < 0) {
-		btrfs_print_leaf(root, leaf);
-		BUG();
-	}
-out:
-	if (!ret)
-		ret = nr;
-	return ret;
-}
-
-/*
  * this is a helper for btrfs_insert_empty_items, the main goal here is
  * to save stack depth by doing the bulk of the work in a function
  * that doesn't call btrfs_search_slot
-- 
cgit v0.10.2


From 7a2d6a64645b38d7040bbd031c7a7b2655f5d976 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Mon, 1 Oct 2012 03:07:15 -0600
Subject: Btrfs: remove unnecessary IS_ERR in bio_readpage_error()

Because the value of extent_map is only a correct value or NULL,
so IS_ERR is unnecessary.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e8ee39b..67fe401 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2110,7 +2110,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
 		}
 		read_unlock(&em_tree->lock);
 
-		if (!em || IS_ERR(em)) {
+		if (!em) {
 			kfree(failrec);
 			return -EIO;
 		}
-- 
cgit v0.10.2


From 479ed9abdbeec5d9ed0005f3bee9c9bc06a102bb Mon Sep 17 00:00:00 2001
From: Robin Dong <sanbai@taobao.com>
Date: Sat, 29 Sep 2012 02:07:47 -0600
Subject: btrfs: move inline function code to header file

When building btrfs from kernel code, it will report:

	fs/btrfs/extent_io.h:281: warning: 'extent_buffer_page' declared inline after being called
	fs/btrfs/extent_io.h:281: warning: previous declaration of 'extent_buffer_page' was here
	fs/btrfs/extent_io.h:280: warning: 'num_extent_pages' declared inline after being called
	fs/btrfs/extent_io.h:280: warning: previous declaration of 'num_extent_pages' was here

because of the wrong declaration of inline functions.

Signed-off-by: Robin Dong <sanbai@taobao.com>

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 67fe401..b82d244 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3986,18 +3986,6 @@ out:
 	return ret;
 }
 
-inline struct page *extent_buffer_page(struct extent_buffer *eb,
-					      unsigned long i)
-{
-	return eb->pages[i];
-}
-
-inline unsigned long num_extent_pages(u64 start, u64 len)
-{
-	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
-		(start >> PAGE_CACHE_SHIFT);
-}
-
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
 #if LEAK_DEBUG
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7aeb310..711d12b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -282,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num);
-unsigned long num_extent_pages(u64 start, u64 len);
-struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+		(start >> PAGE_CACHE_SHIFT);
+}
+
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+					      unsigned long i)
+{
+	return eb->pages[i];
+}
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {
-- 
cgit v0.10.2


From 94edf4ae43a5f9405fe3570f670e26ce1b188476 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 25 Sep 2012 14:56:25 -0400
Subject: Btrfs: don't bother committing delayed inode updates when fsyncing

We can just copy the in memory inode into the tree log directly, no sense in
updating the fs tree so we can copy it into the tree log tree.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 179fda9..47911fd 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2944,6 +2944,55 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+			    struct extent_buffer *leaf,
+			    struct btrfs_inode_item *item,
+			    struct inode *inode, int log_inode_only)
+{
+	btrfs_set_inode_uid(leaf, item, inode->i_uid);
+	btrfs_set_inode_gid(leaf, item, inode->i_gid);
+	btrfs_set_inode_mode(leaf, item, inode->i_mode);
+	btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
+			       inode->i_atime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
+				inode->i_atime.tv_nsec);
+
+	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
+			       inode->i_mtime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
+				inode->i_mtime.tv_nsec);
+
+	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
+			       inode->i_ctime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
+				inode->i_ctime.tv_nsec);
+
+	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
+
+	btrfs_set_inode_sequence(leaf, item, inode->i_version);
+	btrfs_set_inode_transid(leaf, item, trans->transid);
+	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
+	btrfs_set_inode_block_group(leaf, item, 0);
+
+	if (log_inode_only) {
+		/* set the generation to zero so the recover code
+		 * can tell the difference between an logging
+		 * just to say 'this inode exists' and a logging
+		 * to say 'update this inode with these values'
+		 */
+		btrfs_set_inode_generation(leaf, item, 0);
+		btrfs_set_inode_size(leaf, item, 0);
+	} else {
+		btrfs_set_inode_generation(leaf, item,
+					   BTRFS_I(inode)->generation);
+		btrfs_set_inode_size(leaf, item, inode->i_size);
+	}
+
+}
+
 static noinline int copy_items(struct btrfs_trans_handle *trans,
 			       struct inode *inode,
 			       struct btrfs_path *dst_path,
@@ -2990,24 +3039,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 
 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
 
-		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
-				   src_offset, ins_sizes[i]);
-
-		if (inode_only == LOG_INODE_EXISTS &&
-		    ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
 						    dst_path->slots[0],
 						    struct btrfs_inode_item);
-			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
-
-			/* set the generation to zero so the recover code
-			 * can tell the difference between an logging
-			 * just to say 'this inode exists' and a logging
-			 * to say 'update this inode with these values'
-			 */
-			btrfs_set_inode_generation(dst_path->nodes[0],
-						   inode_item, 0);
+			fill_inode_item(trans, dst_path->nodes[0], inode_item,
+					inode, inode_only == LOG_INODE_EXISTS);
+		} else {
+			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+					   src_offset, ins_sizes[i]);
 		}
+
 		/* take a reference on file data extents so that truncates
 		 * or deletes of this inode don't have to relog the inode
 		 * again
@@ -3361,11 +3403,15 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		max_key.type = (u8)-1;
 	max_key.offset = (u64)-1;
 
-	ret = btrfs_commit_inode_delayed_items(trans, inode);
-	if (ret) {
-		btrfs_free_path(path);
-		btrfs_free_path(dst_path);
-		return ret;
+	/* Only run delayed items if we are a dir or a new file */
+	if (S_ISDIR(inode->i_mode) ||
+	    BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+		ret = btrfs_commit_inode_delayed_items(trans, inode);
+		if (ret) {
+			btrfs_free_path(path);
+			btrfs_free_path(dst_path);
+			return ret;
+		}
 	}
 
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
-- 
cgit v0.10.2


From 489406626c42c0176fddae2182d33be2cfb9840c Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel@quora.org>
Date: Mon, 7 May 2012 06:35:38 -0600
Subject: btrfs: fix message printing

Fix various messages to include newline and module prefix.

Signed-off-by: Daniel J Blueman <daniel@quora.org>

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f8b803f..0fadcdb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -243,7 +243,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root, const char *function,
 			       unsigned int line, int errno)
 {
-	WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted");
+	WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");
 	trans->aborted = errno;
 	/* Nothing used. The other threads that have joined this
 	 * transaction may be able to continue. */
@@ -549,11 +549,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
 			break;
 		case Opt_defrag:
-			printk(KERN_INFO "btrfs: enabling auto defrag");
+			printk(KERN_INFO "btrfs: enabling auto defrag\n");
 			btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
 			break;
 		case Opt_recovery:
-			printk(KERN_INFO "btrfs: enabling auto recovery");
+			printk(KERN_INFO "btrfs: enabling auto recovery\n");
 			btrfs_set_opt(info->mount_opt, RECOVERY);
 			break;
 		case Opt_skip_balance:
@@ -1602,7 +1602,7 @@ static int btrfs_interface_init(void)
 static void btrfs_interface_exit(void)
 {
 	if (misc_deregister(&btrfs_misc) < 0)
-		printk(KERN_INFO "misc_deregister failed for control device");
+		printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
 }
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4f4de51..dfe5e3a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -639,7 +639,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 		bdev = blkdev_get_by_path(device->name->str, flags, holder);
 		if (IS_ERR(bdev)) {
-			printk(KERN_INFO "open %s failed\n", device->name->str);
+			printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
 			goto error;
 		}
 		filemap_write_and_wait(bdev->bd_inode->i_mapping);
@@ -3769,7 +3769,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	read_unlock(&em_tree->lock);
 
 	if (!em) {
-		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+		printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
 		       (unsigned long long)logical,
 		       (unsigned long long)*length);
 		BUG();
@@ -4226,7 +4226,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
 	total_devs = bbio->num_stripes;
 	if (map_length < length) {
-		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
 		       "len %llu\n", (unsigned long long)logical,
 		       (unsigned long long)length,
 		       (unsigned long long)map_length);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 92c2065..9acb846 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
 	*total_in = 0;
 
 	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
-		printk(KERN_WARNING "deflateInit failed\n");
+		printk(KERN_WARNING "btrfs: deflateInit failed\n");
 		ret = -1;
 		goto out;
 	}
@@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,
 	while (workspace->def_strm.total_in < len) {
 		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
 		if (ret != Z_OK) {
-			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+			printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",
 			       ret);
 			zlib_deflateEnd(&workspace->def_strm);
 			ret = -1;
@@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
 	}
 
 	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
-		printk(KERN_WARNING "inflateInit failed\n");
+		printk(KERN_WARNING "btrfs: inflateInit failed\n");
 		return -1;
 	}
 	while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
 	}
 
 	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
-		printk(KERN_WARNING "inflateInit failed\n");
+		printk(KERN_WARNING "btrfs: inflateInit failed\n");
 		return -1;
 	}
 
-- 
cgit v0.10.2


From bedb2cca7252d08c6ca3085826e30f65bdc3b54b Mon Sep 17 00:00:00 2001
From: Andrei Popa <andrei.popa@i-neo.ro>
Date: Thu, 20 Sep 2012 08:42:11 -0600
Subject: Btrfs: make compress and nodatacow mount options mutually exclusive

If a filesystem is mounted with compression and then remounted by adding nodatacow,
the compression is disabled but the compress flag is still visible.
Also, if a filesystem is mounted with nodatacow and then remounted with compression,
nodatacow flag is still present but it's not active.
This patch:
- removes compress flags and notifies that the compression has been disabled if the
  filesystem is mounted with nodatacow
- removes nodatacow and nodatasum flags if mounted with compress.

Signed-off-by: Andrei Popa <andrei.popa@i-neo.ro>

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0fadcdb..915ac14 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -413,7 +413,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
 		case Opt_nodatacow:
-			printk(KERN_INFO "btrfs: setting nodatacow\n");
+			if (!btrfs_test_opt(root, COMPRESS) ||
+				!btrfs_test_opt(root, FORCE_COMPRESS)) {
+					printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n");
+			} else {
+				printk(KERN_INFO "btrfs: setting nodatacow\n");
+			}
+			info->compress_type = BTRFS_COMPRESS_NONE;
+			btrfs_clear_opt(info->mount_opt, COMPRESS);
+			btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
 			btrfs_set_opt(info->mount_opt, NODATACOW);
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
@@ -428,10 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				compress_type = "zlib";
 				info->compress_type = BTRFS_COMPRESS_ZLIB;
 				btrfs_set_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, NODATACOW);
+				btrfs_clear_opt(info->mount_opt, NODATASUM);
 			} else if (strcmp(args[0].from, "lzo") == 0) {
 				compress_type = "lzo";
 				info->compress_type = BTRFS_COMPRESS_LZO;
 				btrfs_set_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, NODATACOW);
+				btrfs_clear_opt(info->mount_opt, NODATASUM);
 				btrfs_set_fs_incompat(info, COMPRESS_LZO);
 			} else if (strncmp(args[0].from, "no", 2) == 0) {
 				compress_type = "no";
-- 
cgit v0.10.2


From 62856a9b73860cffe2a3d91b069393b88c219aa6 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Tue, 31 Jul 2012 11:09:44 -0600
Subject: Btrfs: detect corrupted filesystem after write I/O errors

In check-integrity, detect when a superblock is written that points
to blocks that have not been written to disk due to I/O write errors.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9197e2e..5a3e45d 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -37,8 +37,9 @@
  *        the file system was mounted, (i.e., they have been
  *        referenced by the super block) or they have been
  *        written since then and the write completion callback
- *        was called and a FLUSH request to the device where
- *        these blocks are located was received and completed.
+ *        was called and no write error was indicated and a
+ *        FLUSH request to the device where these blocks are
+ *        located was received and completed.
  *    2b. All referenced blocks need to have a generation
  *        number which is equal to the parent's number.
  *
@@ -2601,6 +2602,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
 			       (unsigned long long)l->block_ref_to->dev_bytenr,
 			       l->block_ref_to->mirror_num);
 			ret = -1;
+		} else if (l->block_ref_to->iodone_w_error) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " which has write error!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       (unsigned long long)
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+			ret = -1;
 		} else if (l->parent_generation !=
 			   l->block_ref_to->generation &&
 			   BTRFSIC_GENERATION_UNKNOWN !=
-- 
cgit v0.10.2


From 5af3e8cce8b7ba0a2819e18c9146c8c0b452d479 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Wed, 1 Aug 2012 18:56:49 +0200
Subject: Btrfs: make filesystem read-only when submitting barrier fails

So far the return code of barrier_all_devices() is ignored, which
means that errors are ignored. The result can be a corrupt
filesystem which is not consistent.
This commit adds code to evaluate the return code of
barrier_all_devices(). The normal btrfs_error() mechanism is used to
switch the filesystem into read-only mode when errors are detected.

In order to decide whether barrier_all_devices() should return
error or success, the number of disks that are allowed to fail the
barrier submission is calculated. This calculation accounts for the
worst RAID level of metadata, system and data. If single, dup or
RAID0 is in use, a single disk error is already considered to be
fatal. Otherwise a single disk error is tolerated.

The calculation of the number of disks that are tolerated to fail
the barrier operation is performed when the filesystem gets mounted,
when a balance operation is started and finished, and when devices
are added or removed.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50dcd0f..1630be8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1468,6 +1468,8 @@ struct btrfs_fs_info {
 
 	/* next backup root to be overwritten */
 	int backup_root_index;
+
+	int num_tolerated_disk_barrier_failures;
 };
 
 /*
@@ -3361,6 +3363,9 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 int btrfs_defrag_file(struct inode *inode, struct file *file,
 		      struct btrfs_ioctl_defrag_range_args *range,
 		      u64 newer_than, unsigned long max_pages);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+				struct btrfs_ioctl_space_info *space);
+
 /* file.c */
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c699955..8355236 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2505,6 +2505,8 @@ retry_root_backup:
 		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
 		goto fail_block_groups;
 	}
+	fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
 
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
@@ -2888,12 +2890,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 			printk_in_rcu("btrfs: disabling barriers on dev %s\n",
 				      rcu_str_deref(device->name));
 			device->nobarriers = 1;
-		}
-		if (!bio_flagged(bio, BIO_UPTODATE)) {
+		} else if (!bio_flagged(bio, BIO_UPTODATE)) {
 			ret = -EIO;
-			if (!bio_flagged(bio, BIO_EOPNOTSUPP))
-				btrfs_dev_stat_inc_and_print(device,
-					BTRFS_DEV_STAT_FLUSH_ERRS);
+			btrfs_dev_stat_inc_and_print(device,
+				BTRFS_DEV_STAT_FLUSH_ERRS);
 		}
 
 		/* drop the reference from the wait == 0 run */
@@ -2932,14 +2932,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 {
 	struct list_head *head;
 	struct btrfs_device *dev;
-	int errors = 0;
+	int errors_send = 0;
+	int errors_wait = 0;
 	int ret;
 
 	/* send down all the barriers */
 	head = &info->fs_devices->devices;
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
-			errors++;
+			errors_send++;
 			continue;
 		}
 		if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +2948,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 		ret = write_dev_flush(dev, 0);
 		if (ret)
-			errors++;
+			errors_send++;
 	}
 
 	/* wait for all the barriers */
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
-			errors++;
+			errors_wait++;
 			continue;
 		}
 		if (!dev->in_fs_metadata || !dev->writeable)
@@ -2961,13 +2962,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 		ret = write_dev_flush(dev, 1);
 		if (ret)
-			errors++;
+			errors_wait++;
 	}
-	if (errors)
+	if (errors_send > info->num_tolerated_disk_barrier_failures ||
+	    errors_wait > info->num_tolerated_disk_barrier_failures)
 		return -EIO;
 	return 0;
 }
 
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+	struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ioctl_space_info space;
+	struct btrfs_space_info *sinfo;
+	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+		       BTRFS_BLOCK_GROUP_SYSTEM,
+		       BTRFS_BLOCK_GROUP_METADATA,
+		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+	int num_types = 4;
+	int i;
+	int c;
+	int num_tolerated_disk_barrier_failures =
+		(int)fs_info->fs_devices->num_devices;
+
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		sinfo = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+			if (tmp->flags == types[i]) {
+				sinfo = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (!sinfo)
+			continue;
+
+		down_read(&sinfo->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&sinfo->block_groups[c])) {
+				u64 flags;
+
+				btrfs_get_block_group_info(
+					&sinfo->block_groups[c], &space);
+				if (space.total_bytes == 0 ||
+				    space.used_bytes == 0)
+					continue;
+				flags = space.flags;
+				/*
+				 * return
+				 * 0: if dup, single or RAID0 is configured for
+				 *    any of metadata, system or data, else
+				 * 1: if RAID5 is configured, or if RAID1 or
+				 *    RAID10 is configured and only two mirrors
+				 *    are used, else
+				 * 2: if RAID6 is configured, else
+				 * num_mirrors - 1: if RAID1 or RAID10 is
+				 *                  configured and more than
+				 *                  2 mirrors are used.
+				 */
+				if (num_tolerated_disk_barrier_failures > 0 &&
+				    ((flags & (BTRFS_BLOCK_GROUP_DUP |
+					       BTRFS_BLOCK_GROUP_RAID0)) ||
+				     ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+				      == 0)))
+					num_tolerated_disk_barrier_failures = 0;
+				else if (num_tolerated_disk_barrier_failures > 1
+					 &&
+					 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+						   BTRFS_BLOCK_GROUP_RAID10)))
+					num_tolerated_disk_barrier_failures = 1;
+			}
+		}
+		up_read(&sinfo->groups_sem);
+	}
+
+	return num_tolerated_disk_barrier_failures;
+}
+
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *head;
@@ -2990,8 +3065,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	head = &root->fs_info->fs_devices->devices;
 
-	if (do_barriers)
-		barrier_all_devices(root->fs_info);
+	if (do_barriers) {
+		ret = barrier_all_devices(root->fs_info);
+		if (ret) {
+			mutex_unlock(
+				&root->fs_info->fs_devices->device_list_mutex);
+			btrfs_error(root->fs_info, ret,
+				    "errors while submitting device barriers.");
+			return ret;
+		}
+	}
 
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c5b00a7..2025a91 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 				     u64 objectid);
 int btree_lock_page_hook(struct page *page, void *data,
 				void (*flush_fn)(void *));
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+	struct btrfs_fs_info *fs_info);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d6836af..f5a2e6c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2875,8 +2875,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	return 0;
 }
 
-static void get_block_group_info(struct list_head *groups_list,
-				 struct btrfs_ioctl_space_info *space)
+void btrfs_get_block_group_info(struct list_head *groups_list,
+				struct btrfs_ioctl_space_info *space)
 {
 	struct btrfs_block_group_cache *block_group;
 
@@ -2984,8 +2984,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 		down_read(&info->groups_sem);
 		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
 			if (!list_empty(&info->block_groups[c])) {
-				get_block_group_info(&info->block_groups[c],
-						     &space);
+				btrfs_get_block_group_info(
+					&info->block_groups[c], &space);
 				memcpy(dest, &space, sizeof(space));
 				dest++;
 				space_args.total_spaces++;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 47911fd..67eab2d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2425,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * in and cause problems either.
 	 */
 	btrfs_scrub_pause_super(root);
-	write_ctree_super(trans, root->fs_info->tree_root, 1);
+	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
 	btrfs_scrub_continue_super(root);
-	ret = 0;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_wake_log_root;
+	}
 
 	mutex_lock(&root->log_mutex);
 	if (root->last_log_commit < log_transid)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dfe5e3a..029b903 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		free_fs_devices(cur_devices);
 	}
 
+	root->fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+
 	/*
 	 * at this point, the device is zero sized.  We want to
 	 * remove it from the devices list and zero out the old super
@@ -1799,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	btrfs_clear_space_info_full(root->fs_info);
 
 	unlock_chunks(root);
+	root->fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
 	ret = btrfs_commit_transaction(trans, root);
 
 	if (seeding_dev) {
@@ -2809,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		}
 	}
 
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		int num_tolerated_disk_barrier_failures;
+		u64 target = bctl->sys.target;
+
+		num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+		if (num_tolerated_disk_barrier_failures > 0 &&
+		    (target &
+		     (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		      BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
+			num_tolerated_disk_barrier_failures = 0;
+		else if (num_tolerated_disk_barrier_failures > 1 &&
+			 (target &
+			  (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
+			num_tolerated_disk_barrier_failures = 1;
+
+		fs_info->num_tolerated_disk_barrier_failures =
+			num_tolerated_disk_barrier_failures;
+	}
+
 	ret = insert_balance_item(fs_info->tree_root, bctl);
 	if (ret && ret != -EEXIST)
 		goto out;
@@ -2841,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		__cancel_balance(fs_info);
 	}
 
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		fs_info->num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+	}
+
 	wake_up(&fs_info->balance_wait_q);
 
 	return ret;
-- 
cgit v0.10.2


From 15e3004a0eb2c4879bc666d0e6a3acd973fb226c Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 5 Oct 2012 13:39:50 -0400
Subject: Btrfs: cleanup pages properly when ENOMEM in compression
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We were freeing non-existent pages which was causing a panic for a user who
was suffering from ENOMEM.  This patch fixes the problem.  Thanks,

Reported-by: Jérôme Poulin <jeromepoulin@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 43d1c5a..c6467aa 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	u64 em_start;
 	struct extent_map *em;
 	int ret = -ENOMEM;
+	int faili = 0;
 	u32 *sums;
 
 	tree = &BTRFS_I(inode)->io_tree;
@@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
 		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
 							      __GFP_HIGHMEM);
-		if (!cb->compressed_pages[pg_index])
+		if (!cb->compressed_pages[pg_index]) {
+			faili = pg_index - 1;
+			ret = -ENOMEM;
 			goto fail2;
+		}
 	}
+	faili = nr_pages - 1;
 	cb->nr_pages = nr_pages;
 
 	add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -713,8 +718,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	return 0;
 
 fail2:
-	for (pg_index = 0; pg_index < nr_pages; pg_index++)
-		free_page((unsigned long)cb->compressed_pages[pg_index]);
+	while (faili >= 0) {
+		__free_page(cb->compressed_pages[faili]);
+		faili--;
+	}
 
 	kfree(cb->compressed_pages);
 fail1:
-- 
cgit v0.10.2


From edd33c99c4ba26ebe17c1a3d65b4aba25482ed32 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 5 Oct 2012 16:40:32 -0400
Subject: Btrfs: don't bug on enomem in readpage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Get rid of the BUG_ON(ret == -ENOMEM) in __extent_read_full_page.  Thanks,

Reported-by: Jérôme Poulin <jeromepoulin@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b82d244..8c37cb6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2751,12 +2751,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 					 end_bio_extent_readpage, mirror_num,
 					 *bio_flags,
 					 this_bio_flag);
-			BUG_ON(ret == -ENOMEM);
-			nr++;
-			*bio_flags = this_bio_flag;
+			if (!ret) {
+				nr++;
+				*bio_flags = this_bio_flag;
+			}
 		}
-		if (ret)
+		if (ret) {
 			SetPageError(page);
+			unlock_extent(tree, cur, cur + iosize - 1);
+		}
 		cur = cur + iosize;
 		pg_offset += iosize;
 	}
-- 
cgit v0.10.2


From 4804b38293c020e7a2c841e86402f456c19d934d Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 5 Oct 2012 16:43:45 -0400
Subject: Btrfs: do not warn_on when we cannot alloc a page for an extent
 buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's just annoying and the user will have gotten a nice OOM killer message
so they are already fully aware they are screwed :).  Thanks,

Reported-by: Jérôme Poulin <jeromepoulin@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8c37cb6..7dc69b3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4249,10 +4249,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 
 	for (i = 0; i < num_pages; i++, index++) {
 		p = find_or_create_page(mapping, index, GFP_NOFS);
-		if (!p) {
-			WARN_ON(1);
+		if (!p)
 			goto free_eb;
-		}
 
 		spin_lock(&mapping->private_lock);
 		if (PagePrivate(p)) {
-- 
cgit v0.10.2


From f60b1b49f6f72abb8bedfd49b758773bbda043c8 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 5 Oct 2012 16:53:34 -0400
Subject: Btrfs: fix page leakage

Alloc_dummy_extent_buffer will not free the first page in the eb array if we
fail to allocate a page, fix this.  Thanks,

Reported-by: David Sterba <dave@jikos.cz>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7dc69b3..64dc93f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4104,7 +4104,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
 
 	return eb;
 err:
-	for (i--; i > 0; i--)
+	for (i--; i >= 0; i--)
 		__free_page(eb->pages[i]);
 	__free_extent_buffer(eb);
 	return NULL;
-- 
cgit v0.10.2


From 1037a5affc266fdfe08b2dd415d309ab2778ce6a Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@gmail.com>
Date: Mon, 8 Oct 2012 07:26:15 -0600
Subject: Btrfs: remove repeated eb->pages check in,
 disk-io.c/csum_dirty_buffer

In csum_dirty_buffer, we first get eb from page->private.
Then we check if the page is the first page of eb. Later
we check it again. Remove the repeated check here.

Signed-off-by: Wang Sheng-Hui <shhuiw@gmail.com>

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8355236..7cda519 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -433,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		WARN_ON(1);
 		return 0;
 	}
-	if (eb->pages[0] != page) {
-		WARN_ON(1);
-		return 0;
-	}
 	if (!PageUptodate(page)) {
 		WARN_ON(1);
 		return 0;
-- 
cgit v0.10.2


From f46dbe3dee853f8a860f889cb2b7ff4c624f2a7a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@fusionio.com>
Date: Tue, 9 Oct 2012 11:17:20 -0400
Subject: btrfs: init ref_index to zero in add_inode_ref

Signed-off-by: Chris Mason <chris.mason@fusionio.com>

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 67eab2d..e9ebb47 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1033,7 +1033,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	int log_ref_ver = 0;
 	u64 parent_objectid;
 	u64 inode_objectid;
-	u64 ref_index;
+	u64 ref_index = 0;
 	int ref_struct_size;
 
 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
-- 
cgit v0.10.2