From e67bc2b35905cb82e9ee1f485095d8c0739b68c8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 17 Feb 2014 20:34:53 -0500 Subject: ext4: Add __init marking to init_inodecache init_inodecache is only called by __init init_ext4_fs. Signed-off-by: Fabian Frederick Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 710fed2..7a829f7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -940,7 +940,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), -- cgit v0.10.2 From d8558a297878f1a7af995f6801983783e1487208 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 17 Feb 2014 20:44:36 -0500 Subject: ext4: clean up error handling in swap_inode_boot_loader() Tighten up the code to make the code easier to read and maintain. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a2a837f..0f2252e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -104,21 +104,15 @@ static long swap_inode_boot_loader(struct super_block *sb, struct ext4_inode_info *ei_bl; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { - err = -EINVAL; - goto swap_boot_out; - } + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + return -EINVAL; - if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto swap_boot_out; - } + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + return -EPERM; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); - if (IS_ERR(inode_bl)) { - err = PTR_ERR(inode_bl); - goto swap_boot_out; - } + if (IS_ERR(inode_bl)) + return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); filemap_flush(inode->i_mapping); @@ -193,20 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_mark_inode_dirty(handle, inode); } } - ext4_journal_stop(handle); - ext4_double_up_write_data_sem(inode, inode_bl); journal_err_out: ext4_inode_resume_unlocked_dio(inode); ext4_inode_resume_unlocked_dio(inode_bl); - unlock_two_nondirectories(inode, inode_bl); - iput(inode_bl); - -swap_boot_out: return err; } -- cgit v0.10.2 From df3a98b0865467ae8033c55ebb514debd69b4e59 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 17 Feb 2014 20:46:40 -0500 Subject: ext4: remove an unneeded check in mext_page_mkuptodate() "err" is zero here, there is no need to check again. Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 773b503..f39a88a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -861,8 +861,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) } if (!buffer_mapped(bh)) { zero_user(page, block_start, blocksize); - if (!err) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); continue; } } -- cgit v0.10.2 From 7747e6d028b891f3bd02d93295d80f230ba43f6a Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Mon, 17 Feb 2014 20:49:04 -0500 Subject: jbd2: mark file-local functions as static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark functions as static in jbd2/journal.c because they are not used outside this file. 
This eliminates the following warning in jbd2/journal.c: fs/jbd2/journal.c:125:5: warning: no previous prototype for ‘jbd2_verify_csum_type’ [-Wmissing-prototypes] fs/jbd2/journal.c:146:5: warning: no previous prototype for ‘jbd2_superblock_csum_verify’ [-Wmissing-prototypes] fs/jbd2/journal.c:154:6: warning: no previous prototype for ‘jbd2_superblock_csum_set’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Signed-off-by: "Theodore Ts'o" Reviewed-by: Josh Triplett Reviewed-by: Darrick J. Wong diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5fa344a..244b6f6 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug); #endif /* Checksumming functions */ -int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) +static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) return cpu_to_be32(csum); } -int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) +static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) return sb->s_checksum == jbd2_superblock_csum(j, sb); } -void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) +static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; -- cgit v0.10.2 From 024949ec8fc165bfac8eb051e537bc303adb365f Mon Sep 17 00:00:00 2001 From: Patrick Palka Date: Mon, 17 Feb 2014 20:50:59 -0500 Subject: ext4: address a benign compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When !defined(CONFIG_EXT4_DEBUG), mb_debug() should be defined as a no_printk() statement instead of an empty statement in order to suppress the following compiler warning: fs/ext4/mballoc.c: In function ‘ext4_mb_cleanup_pa’: fs/ext4/mballoc.c:2659:47: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] mb_debug(1, "mballoc: %u PAs left\n", count); Signed-off-by: Patrick Palka Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 08481ee..9347328 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug; } \ } while (0) #else -#define mb_debug(n, fmt, a...) +#define mb_debug(n, fmt, a...) no_printk(fmt, ## a) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ -- cgit v0.10.2 From ce37c42919608e96ade3748fe23c3062a0a966c5 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Wed, 19 Feb 2014 18:52:39 -0500 Subject: ext4: fix error return from ext4_ext_handle_uninitialized_extents() Commit 3779473246 breaks the return of error codes from ext4_ext_handle_uninitialized_extents() in ext4_ext_map_blocks(). A portion of the patch assigns that function's signed integer return value to an unsigned int. Consequently, negatively valued error codes are lost and can be treated as a bogus allocated block count. 
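To make the failure mode concrete, here is a minimal userspace sketch (hypothetical names, not the ext4 code) of how a negative error code assigned to an unsigned variable turns into a bogus block count:

#include <stdio.h>

/* Stand-in for a helper that returns a block count or a negative errno. */
static int handle_extents(void)
{
	return -5;			/* pretend this is -EIO */
}

int main(void)
{
	unsigned int allocated = handle_extents();	/* bug: -5 becomes 4294967291 */
	int ret = handle_extents();			/* fix: keep the value signed */

	printf("as unsigned: %u (looks like a huge allocation)\n", allocated);
	if (ret < 0)
		printf("as signed: error %d is still detectable\n", ret);
	return 0;
}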
Signed-off-by: Eric Whitney Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74bc2d5..9875fd0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4128,7 +4128,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth; + int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -4189,9 +4189,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (!ext4_ext_is_uninitialized(ex)) goto out; - allocated = ext4_ext_handle_uninitialized_extents( + ret = ext4_ext_handle_uninitialized_extents( handle, inode, map, path, flags, allocated, newblock); + if (ret < 0) + err = ret; + else + allocated = ret; goto out3; } } -- cgit v0.10.2 From 9a6633b1a3603ccdffec669033616f9ebb35a988 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 19 Feb 2014 20:15:15 -0500 Subject: ext4: add ext4_es_store_pblock_status() Avoid false positives by static code analysis tools such as sparse and coverity caused by the fact that we set the physical block, and then the status in the extent_status structure. It is also more efficient to set both of these values at once. Addresses-Coverity-Id: #989077 Addresses-Coverity-Id: #989078 Addresses-Coverity-Id: #1080722 Signed-off-by: "Theodore Ts'o" Reviewed-by: Zheng Liu diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 3981ff7..a900004 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -658,8 +658,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; - ext4_es_store_pblock(&newes, pblk); - ext4_es_store_status(&newes, status); + ext4_es_store_pblock_status(&newes, pblk, status); trace_ext4_es_insert_extent(inode, &newes); ext4_es_insert_extent_check(inode, &newes); @@ -699,8 +698,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; - ext4_es_store_pblock(&newes, pblk); - ext4_es_store_status(&newes, status); + ext4_es_store_pblock_status(&newes, pblk, status); trace_ext4_es_cache_extent(inode, &newes); if (!len) @@ -812,13 +810,13 @@ retry: newes.es_lblk = end + 1; newes.es_len = len2; + block = 0x7FDEADBEEF; if (ext4_es_is_written(&orig_es) || - ext4_es_is_unwritten(&orig_es)) { + ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + orig_es.es_len - len2; - ext4_es_store_pblock(&newes, block); - } - ext4_es_store_status(&newes, ext4_es_status(&orig_es)); + ext4_es_store_pblock_status(&newes, block, + ext4_es_status(&orig_es)); err = __es_insert_extent(inode, &newes); if (err) { es->es_lblk = orig_es.es_lblk; diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 167f4ab8..f1b62a4 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es, (es->es_pblk & ~ES_MASK)); } +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (pb & ~ES_MASK)); +} + extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_lru_add(struct 
inode *inode); -- cgit v0.10.2 From 7b1b2c1b9c397dcb86293ae79aa7fb7c5446120f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 19 Feb 2014 20:15:21 -0500 Subject: ext4: don't calculate total xattr header size unless needed The function ext4_expand_extra_isize_ea() doesn't need the size of all of the extended attribute headers. So if we skip calculating it when it is not needed, we can avoid some unneeded memory references and, as a bonus, eliminate some kvetching by static code analysis tools. Addresses-Coverity-Id: #741291 Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e175e94..185066f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -567,12 +567,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - *total += EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_block && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } + if (total) + *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } @@ -1228,7 +1229,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t min_offs, free; - int total_ino, total_blk; + int total_ino; void *base, *start, *end; int extra_isize = 0, error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); @@ -1286,8 +1287,7 @@ retry: first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, - &total_blk); + free = ext4_xattr_free_space(first, &min_offs, base, NULL); if (free < new_extra_isize) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; -- cgit v0.10.2 From ab0c00fccf81dcf1dc5db0e389294ffea53be666 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 20 Feb 2014 00:36:41 -0500 Subject: ext4: make sure ex.fe_logical is initialized The lowest levels of mballoc set all of the fields of struct ext4_free_extent except for fe_logical, since they are just trying to find the requested free set of blocks, and the logical block hasn't been set yet. This makes some static code checkers sad. Set it to various different debug values, which would be useful when debugging mballoc if these values were ever to show up due to parts of mballoc trying to use ac->ac_b_ex.fe_logical before the upper layers of mballoc have properly set it, usually via ext4_mb_use_best_found().
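The poison values in the patch below follow a common debugging idiom; here is a small self-contained sketch of it (hypothetical structure, not the real mballoc types): a field that is not yet valid is filled with a recognizable constant so any premature use stands out immediately.

#include <stdio.h>

struct free_extent {
	unsigned int fe_start;
	unsigned int fe_len;
	unsigned int fe_logical;	/* only meaningful once the caller sets it */
};

static void find_extent(struct free_extent *ex)
{
	ex->fe_start = 100;
	ex->fe_len = 8;
	ex->fe_logical = 0xDEADC0DE;	/* poison: caller is expected to overwrite */
}

int main(void)
{
	struct free_extent ex;

	find_extent(&ex);
	/* Any stray read of fe_logical now prints an obvious poison value. */
	printf("logical=%#x\n", ex.fe_logical);
	return 0;
}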
Addresses-Coverity-Id: #139697 Addresses-Coverity-Id: #139698 Addresses-Coverity-Id: #139699 Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 04a5c75..0d42f63 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1808,6 +1808,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); + ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; @@ -1936,7 +1937,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, */ break; } - + ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; @@ -1977,6 +1978,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; + ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; -- cgit v0.10.2 From e861b5e9a47bd8c6a7491a2b9f6e9a230b1b8e86 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 20 Feb 2014 12:54:05 -0500 Subject: ext4: avoid possible overflow in ext4_map_blocks() The ext4_map_blocks() function returns the number of blocks which satisfy the caller's request. This number of blocks requested by the caller is specified by an unsigned integer, but the return value of ext4_map_blocks() is a signed integer (to accommodate error codes per the kernel's standard error signalling convention). Historically, overflows could never happen since mballoc() will refuse to allocate more than 2048 blocks at a time (which is something we should fix), and if the blocks were already allocated, the fact that there would be some number of intervening metadata blocks pretty much guaranteed that there could never be a contiguous region of data blocks that was greater than 2**31 blocks. However, this is now possible if there is a file system which is a bit bigger than 8TB, and is created using the new mke2fs hugeblock feature, which can create a perfectly contiguous file. In that case, if a userspace program attempted to call fallocate() on this already fully allocated file, it's possible that ext4_map_blocks() could return a number large enough that it would overflow a signed integer, resulting in ext4 thinking that the ext4_map_blocks() call had failed with some strange error code. Since ext4_map_blocks() is always free to return a smaller number of blocks than what was requested by the caller, fix this by capping the number of blocks that ext4_map_blocks() will ever try to map to 2**31 - 1. In practice this should never get hit, except by someone deliberately trying to provoke the above-described bug. Thanks to the PaX team for asking whether this could possibly happen in some off-line discussions about using some static code checking technology they are developing to find bugs in kernel code.
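The fix below is a one-line clamp; this userspace sketch (hypothetical names) shows the same idiom of capping an unsigned request so the count survives the round trip through a signed return value:

#include <limits.h>
#include <stdio.h>

/* Returns the number of units processed, or a negative error code. */
static int map_blocks(unsigned int len)
{
	if (len > INT_MAX)	/* cap the request at 2**31 - 1 */
		len = INT_MAX;
	/* ... perform the mapping; returning a short count is always allowed ... */
	return (int)len;
}

int main(void)
{
	printf("%d\n", map_blocks(4000000000u));	/* prints 2147483647 */
	return 0;
}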
Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6e39895..113458c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -514,6 +514,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + /* + * ext4_map_blocks returns an int, and m_len is an unsigned int + */ + if (unlikely(map->m_len > INT_MAX)) + map->m_len = INT_MAX; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); -- cgit v0.10.2 From dc9ddd984df5f5611c7e2149d19be5a8721c1ac5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 20 Feb 2014 13:32:10 -0500 Subject: ext4: remove unused ac_ex_scanned When looking at a bug report with: > kernel: EXT4-fs: 0 scanned, 0 found I thought wow, 0 scanned, that's odd? But it's not odd; it's printing a variable that is initialized to 0 and never touched again. It's never been used since the original merge, so I don't really even know what the original intent was, either. If anyone knows how to hook it up, speak now via patch, otherwise just yank it so it's not making a confusing situation more confusing in kernel logs. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0d42f63..a888cac 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4008,8 +4008,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", - ac->ac_ex_scanned, ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 9347328..d634e18 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -175,8 +175,6 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; - /* number of iterations done. we have to track to limit searching */ - unsigned long ac_ex_scanned; __u16 ac_groups_scanned; __u16 ac_found; __u16 ac_tail; -- cgit v0.10.2 From ce140cdd9c171dc75cfdcfee2b8708c508f5daf6 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 20 Feb 2014 16:09:12 -0500 Subject: ext4: silence warnings in extent status tree debugging code Adjust the conversion specifications in a few optionally compiled debug messages to match the return type of ext4_es_status(). Also, make a couple of minor grammatical message edits while we're at it. 
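For reference, a userspace sketch of the specifier/type mismatch being corrected below: ext4_es_status() returns an unsigned int, so %x is the matching conversion, while %llx expects an unsigned long long and triggers -Wformat.

#include <stdio.h>

int main(void)
{
	unsigned int status = 0x10;	/* same type as ext4_es_status() returns */

	printf("status %x\n", status);	/* types match: no warning */
#if 0
	printf("status %llx\n", status);	/* mismatch: -Wformat warns and the
						 * behavior is undefined */
#endif
	return 0;
}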
Signed-off-by: Eric Whitney Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index a900004..0a014a7 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode) while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); - printk(KERN_DEBUG " [%u/%u) %llu %llx", + printk(KERN_DEBUG " [%u/%u) %llu %x", es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); node = rb_next(node); @@ -445,8 +445,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, pr_warn("ES insert assertion failed for " "inode: %lu we can find an extent " "at block [%d/%d/%llu/%c], but we " - "want to add an delayed/hole extent " - "[%d/%d/%llu/%llx]\n", + "want to add a delayed/hole extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, ee_block, ee_len, ee_start, ee_status ? 'u' : 'w', es->es_lblk, es->es_len, @@ -486,8 +486,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { pr_warn("ES insert assertion failed for inode: %lu " "can't find an extent at block %d but we want " - "to add an written/unwritten extent " - "[%d/%d/%llu/%llx]\n", inode->i_ino, + "to add a written/unwritten extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); } @@ -524,7 +524,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, */ pr_warn("ES insert assertion failed for inode: %lu " "We can find blocks but we want to add a " - "delayed/hole extent [%d/%d/%llu/%llx]\n", + "delayed/hole extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; @@ -554,7 +554,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, if (ext4_es_is_written(es)) { pr_warn("ES insert assertion failed for inode: %lu " "We can't find the block but we want to add " - "an written extent [%d/%d/%llu/%llx]\n", + "a written extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; -- cgit v0.10.2 From e251f9bca99c0f219eff9c76034476c2b17d3dba Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 20 Feb 2014 16:58:05 -0500 Subject: ext4: avoid exposure of stale data in ext4_punch_hole() While handling punch-hole fallocate, it's useless to truncate page cache before removing the range from extent tree (or block map in indirect case) because page cache can be re-populated (by read-ahead or read(2) or mmap-ed read) immediately after truncating page cache, but before updating extent tree (or block map). In that case the user will see stale data even after fallocate is completed. Until the problem of data corruption resulting from pages backed by already freed blocks is fully resolved, the simple thing we can do now is to add another truncation of pagecache after punch hole is done. 
Signed-off-by: Maxim Patlasov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 113458c..5324a38 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3614,6 +3614,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* Now release the pages again to reduce race window */ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: -- cgit v0.10.2 From a9b8241594adda0a7a4fb3b87bf29d2dff0d997d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 20 Feb 2014 21:17:35 -0500 Subject: ext4: merge uninitialized extents Allow for merging uninitialized extents. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9875fd0..ef4b535 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1691,7 +1691,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * the extent that was written properly split out and conversion to * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); @@ -1708,6 +1708,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; + if (ext4_ext_is_uninitialized(ex1) && + (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || + atomic_read(&EXT4_I(inode)->i_unwritten) || + (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN))) + return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; @@ -1731,7 +1736,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, { struct ext4_extent_header *eh; unsigned int depth, len; - int merge_done = 0; + int merge_done = 0, uninit; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1741,8 +1746,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! 
*/ + uninit = ext4_ext_is_uninitialized(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); + if (uninit) + ext4_ext_mark_uninitialized(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1896,7 +1904,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; - int mb_flags = 0; + int mb_flags = 0, uninit; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1946,9 +1954,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, path + depth); if (err) return err; - + uninit = ext4_ext_is_uninitialized(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); + if (uninit) + ext4_ext_mark_uninitialized(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1971,10 +1981,13 @@ prepend: if (err) return err; + uninit = ext4_ext_is_uninitialized(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); + if (uninit) + ext4_ext_mark_uninitialized(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; -- cgit v0.10.2 From a633f5a319cf4116d977e25fea2830dce23a8e74 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 22 Feb 2014 06:18:17 -0500 Subject: ext4: translate fallocate mode bits to strings Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d3a534f..b7207db 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -31,6 +31,7 @@ #include #include #include +#include <linux/falloc.h> #ifdef __KERNEL__ #include #endif diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ef4b535..2e0608e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -37,7 +37,6 @@ #include #include #include -#include <linux/falloc.h> #include #include #include "ext4_jbd2.h" diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 197d312..451e020 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -68,6 +68,11 @@ struct extent_status; { EXTENT_STATUS_DELAYED, "D" }, \ { EXTENT_STATUS_HOLE, "H" }) +#define show_falloc_mode(mode) __print_flags(mode, "|", \ + { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \ + { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \ + { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}) + TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), @@ -1349,10 +1354,10 @@ TRACE_EVENT(ext4_fallocate_enter, __entry->mode = mode; ), - TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d", + TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, - __entry->len, __entry->mode) + __entry->len, show_falloc_mode(__entry->mode)) ); TRACE_EVENT(ext4_fallocate_exit, -- cgit v0.10.2 From 9eb79482a97152930b113b51dff530aba9e28c8e Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 23 Feb 2014 15:18:59 -0500 Subject: ext4: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate This patch implements fallocate's FALLOC_FL_COLLAPSE_RANGE for Ext4. The semantics of this flag are as follows: 1) It collapses the range lying between offset and length by removing any data blocks which are present in this range and then updates all the logical offsets of extents beyond "offset + len" to nullify the hole created by removing blocks. In short, it does not leave a hole. 
2) It should be used exclusively. No other fallocate flag in combination. 3) Offset and length supplied to fallocate should be fs block size aligned in case of xfs and ext4. 4) Collapse range does not work beyond i_size. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Tested-by: Dongsu Park Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7207db..beec427 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2758,6 +2758,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -2767,6 +2768,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode, extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent); /* page-io.c */ extern int __init ext4_init_pageio(void); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2e0608e..bbba1ef 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4581,12 +4581,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) unsigned int credits, blkbits = inode->i_blkbits; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(inode, offset, len); + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + ret = ext4_convert_inline_data(inode); if (ret) return ret; @@ -4885,3 +4889,304 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ext4_es_lru_add(inode); return error; } + +/* + * ext4_access_path: + * Function to access the path buffer for marking it dirty. + * It also checks if there are sufficient credits left in the journal handle + * to update path. + */ +static int +ext4_access_path(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int credits, err; + + if (!ext4_handle_valid(handle)) + return 0; + + /* + * Check if need to extend journal credits + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups + */ + if (handle->h_buffer_credits < 7) { + credits = ext4_writepage_trans_blocks(inode); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + /* EAGAIN is success */ + if (err && err != -EAGAIN) + return err; + } + + err = ext4_ext_get_access(handle, inode, path); + return err; +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift + * from starting block for each extent. 
+ */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, + struct inode *inode, handle_t *handle, + ext4_lblk_t *start) +{ + int depth, err = 0; + struct ext4_extent *ex_start, *ex_last; + bool update = 0; + depth = path->p_depth; + + while (depth >= 0) { + if (depth == path->p_depth) { + ex_start = path[depth].p_ext; + if (!ex_start) + return -EIO; + + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + if (!ex_last) + return -EIO; + + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) + update = 1; + + *start = ex_last->ee_block + + ext4_ext_get_actual_len(ex_last); + + while (ex_start <= ex_last) { + ex_start->ee_block -= shift; + if (ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) { + if (ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + } + ex_start++; + } + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + if (--depth < 0 || !update) + break; + } + + /* Update index too */ + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + path[depth].p_idx->ei_block -= shift; + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + /* we are done if current index is not a starting index */ + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) + break; + + depth--; + } + +out: + return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lies in the range from start to the last allocated + * block for the file are shifted downwards by shift blocks. + * On success, 0 is returned, error otherwise. + */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, + ext4_lblk_t start, ext4_lblk_t shift) +{ + struct ext4_ext_path *path; + int ret = 0, depth; + struct ext4_extent *extent; + ext4_lblk_t stop_block, current_block; + ext4_lblk_t ex_start, ex_end; + + /* Let path point to the last extent */ + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + ext4_ext_drop_refs(path); + kfree(path); + return ret; + } + + stop_block = extent->ee_block + ext4_ext_get_actual_len(extent); + ext4_ext_drop_refs(path); + kfree(path); + + /* Nothing to shift, if hole is at the end of file */ + if (start >= stop_block) + return ret; + + /* + * Don't start shifting extents until we make sure the hole is big + * enough to accomodate the shift. 
+ */ + path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + depth = path->p_depth; + extent = path[depth].p_ext; + ex_start = extent->ee_block; + ex_end = extent->ee_block + ext4_ext_get_actual_len(extent); + ext4_ext_drop_refs(path); + kfree(path); + + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) + return -EINVAL; + + /* Its safe to start updating extents */ + while (start < stop_block) { + path = ext4_ext_find_extent(inode, start, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + current_block = extent->ee_block; + if (start > current_block) { + /* Hole, move to the next extent */ + ret = mext_next_extent(inode, path, &extent); + if (ret != 0) { + ext4_ext_drop_refs(path); + kfree(path); + if (ret == 1) + ret = 0; + break; + } + } + ret = ext4_ext_shift_path_extents(path, shift, inode, + handle, &start); + ext4_ext_drop_refs(path); + kfree(path); + if (ret) + break; + } + + return ret; +} + +/* + * ext4_collapse_range: + * This implements the fallocate's collapse range functionality for ext4 + * Returns: 0 and non-zero on error. + */ +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t punch_start, punch_stop; + handle_t *handle; + unsigned int credits; + loff_t new_size; + int ret; + + BUG_ON(offset + len > i_size_read(inode)); + + /* Collapse range works only on fs block size aligned offsets. */ + if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || + len & (EXT4_BLOCK_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + trace_ext4_collapse_range(inode, offset, len); + + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + ret = -EPERM; + goto out_mutex; + } + + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + goto out_mutex; + } + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + truncate_pagecache_range(inode, offset, -1); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, punch_start, + EXT_MAX_BLOCKS - punch_start - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + ret = ext4_ext_shift_extents(inode, handle, punch_stop, + punch_stop - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + new_size = i_size_read(inode) - len; + truncate_setsize(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = 
ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index f39a88a..58ee7dc 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) * ext4_ext_path structure refers to the last extent, or a negative error * value on failure. */ -static int +int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 451e020..e9d7ee7 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -16,6 +16,11 @@ struct mpage_da_data; struct ext4_map_blocks; struct extent_status; +/* shim until we merge in the xfs_collapse_range branch */ +#ifndef FALLOC_FL_COLLAPSE_RANGE +#define FALLOC_FL_COLLAPSE_RANGE 0x08 +#endif + #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) #define show_mballoc_flags(flags) __print_flags(flags, "|", \ @@ -71,7 +76,8 @@ struct extent_status; #define show_falloc_mode(mode) __print_flags(mode, "|", \ { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \ { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \ - { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}) + { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}, \ + { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}) TRACE_EVENT(ext4_free_inode, @@ -2415,6 +2421,31 @@ TRACE_EVENT(ext4_es_shrink_exit, __entry->shrunk_nr, __entry->cache_cnt) ); +TRACE_EVENT(ext4_collapse_range, + TP_PROTO(struct inode *inode, loff_t offset, loff_t len), + + TP_ARGS(inode, offset, len), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, offset) + __field(loff_t, len) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu offset %lld len %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->offset, __entry->len) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v0.10.2 From 10542c229a4e8e25b40357beea66abe9dacda2c0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Mar 2014 10:50:50 -0500 Subject: ext4: Speedup WB_SYNC_ALL pass called from sync(2) When doing filesystem wide sync, there's no need to force transaction commit (or synchronously write inode buffer) separately for each inode because ext4_sync_fs() takes care of forcing commit at the end (VFS takes care of flushing buffer cache, respectively). Most of the time this slowness doesn't manifest because previous WB_SYNC_NONE writeback doesn't leave much to write but when there are processes aggressively creating new files and several filesystems to sync, the sync slowness can be noticeable. In the following test script sync(1) takes around 6 minutes when there are two ext4 filesystems mounted on a standard SATA drive. After this patch sync takes a couple of seconds so we have about two orders of magnitude improvement. 
function run_writers { for (( i = 0; i < 10; i++ )); do mkdir $1/dir$i for (( j = 0; j < 40000; j++ )); do dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null done & done } for dir in "$@"; do run_writers $dir done sleep 40 time sync Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5324a38..ab3e835 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4455,7 +4455,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext4_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_force_commit(inode->i_sb); @@ -4465,7 +4470,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) err = __ext4_get_inode_loc(inode, &iloc, 0); if (err) return err; - if (wbc->sync_mode == WB_SYNC_ALL) + /* + * sync(2) will flush the whole buffer cache. No need to do + * it here separately for each inode. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, -- cgit v0.10.2 From df3c1e9a05ff25aca9f54a6c08b77003e2e32bf1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 8 Mar 2014 18:13:52 -0500 Subject: jbd2: don't unplug after writing revoke records During the commit process, keep the block device plugged after we are done writing the revoke records, until we are finished writing the rest of the commit records in the journal. This will allow most of the journal blocks to be written in a single I/O operation, instead of separating the revoke blocks from the rest of the journal blocks. Signed-off-by: "Theodore Ts'o" diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index cf2fc05..765b31d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); jbd2_journal_write_revoke_records(journal, commit_transaction, &log_bufs, WRITE_SYNC); - blk_finish_plug(&plug); jbd_debug(3, "JBD2: commit phase 2b\n"); @@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) err = 0; bufs = 0; descriptor = NULL; - blk_start_plug(&plug); while (commit_transaction->t_buffers) { /* Find the next buffer to be journaled... */ -- cgit v0.10.2 From 3469a32a1e948c54204b5dd6f7476a7d11349e9e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 8 Mar 2014 19:11:36 -0500 Subject: jbd2: don't hold j_state_lock while calling wake_up() The j_state_lock is one of the hottest locks in the jbd2 layer and thus one of its scalability bottlenecks. We don't need to be holding the j_state_lock while we are calling wake_up(&journal->j_wait_commit), so release the lock a little bit earlier. 
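A pthreads analog of this reordering (a sketch of the same idea, not the jbd2 code): setting the flag under the lock and then signalling after the unlock is still race-free, and the woken thread no longer has to block on a lock the waker holds.

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int done;

static void *waiter(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!done)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	pthread_mutex_lock(&lock);
	done = 1;
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&cond);	/* after unlock: waiter can run at once */
	pthread_join(t, NULL);
	return 0;
}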
Signed-off-by: "Theodore Ts'o" diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 244b6f6..67b8e30 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal) journal->j_flags |= JBD2_UNMOUNT; while (journal->j_task) { - wake_up(&journal->j_wait_commit); write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, journal->j_task == NULL); write_lock(&journal->j_state_lock); } @@ -710,8 +710,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); - wake_up(&journal->j_wait_commit); read_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, !tid_gt(tid, journal->j_commit_sequence)); read_lock(&journal->j_state_lock); -- cgit v0.10.2 From 42cf3452d5f5b0817d27c93e4e7d7eab6e89077d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 8 Mar 2014 19:51:16 -0500 Subject: jbd2: calculate statistics without holding j_state_lock and j_list_lock The two hottest locks, and thus the biggest scalability bottlenecks, in the jbd2 layer, are the j_list_lock and j_state_lock. This has inspired some people to do some truly unnatural things[1]. [1] https://www.usenix.org/system/files/conference/fast14/fast14-paper_kang.pdf We don't need to be holding both j_state_lock and j_list_lock while calculating the journal statistics, so move those calculations to the very end of jbd2_journal_commit_transaction. Signed-off-by: "Theodore Ts'o" diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 765b31d..af36252 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1083,24 +1083,7 @@ restart_loop: atomic_read(&commit_transaction->t_handle_count); trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, commit_transaction->t_tid, &stats.run); - - /* - * Calculate overall stats - */ - spin_lock(&journal->j_history_lock); - journal->j_stats.ts_tid++; - if (commit_transaction->t_requested) - journal->j_stats.ts_requested++; - journal->j_stats.run.rs_wait += stats.run.rs_wait; - journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; - journal->j_stats.run.rs_running += stats.run.rs_running; - journal->j_stats.run.rs_locked += stats.run.rs_locked; - journal->j_stats.run.rs_flushing += stats.run.rs_flushing; - journal->j_stats.run.rs_logging += stats.run.rs_logging; - journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; - journal->j_stats.run.rs_blocks += stats.run.rs_blocks; - journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; - spin_unlock(&journal->j_history_lock); + stats.ts_requested = (commit_transaction->t_requested) ? 
1 : 0; commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); @@ -1157,4 +1140,21 @@ restart_loop: spin_unlock(&journal->j_list_lock); write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); + + /* + * Calculate overall stats + */ + spin_lock(&journal->j_history_lock); + journal->j_stats.ts_tid++; + journal->j_stats.ts_requested += stats.ts_requested; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); } -- cgit v0.10.2 From d4e839d4a9dc31d0c229e616146b01e1ace56604 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 8 Mar 2014 22:34:10 -0500 Subject: jbd2: add transaction to checkpoint list earlier We don't otherwise need j_list_lock during the rest of commit phase #7, so add the transaction to the checkpoint list at the very end of commit phase #6. This allows us to drop j_list_lock earlier, which is a good thing since it is a super hot lock. Signed-off-by: "Theodore Ts'o" diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index af36252..5f26139 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1065,6 +1065,25 @@ restart_loop: goto restart_loop; } + /* Add the transaction to the checkpoint list + * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + spin_unlock(&journal->j_list_lock); + /* Done with this transaction! */ jbd_debug(3, "JBD2: commit phase 7\n"); @@ -1103,24 +1122,6 @@ restart_loop: write_unlock(&journal->j_state_lock); - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = - commit_transaction; - } - spin_unlock(&journal->j_list_lock); - /* Drop all spin_locks because commit_callback may be block. 
- * __journal_remove_checkpoint() can not destroy transaction - * under us because it is not marked as T_FINISHED yet */ if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); @@ -1131,7 +1132,7 @@ restart_loop: write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); commit_transaction->t_state = T_FINISHED; - /* Recheck checkpoint lists after j_list_lock was dropped */ + /* Check if the transaction can be dropped now that we are finished */ if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); -- cgit v0.10.2 From d2eb0b998990abf51d6e1d3bf16a2637b920a660 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 9 Mar 2014 00:07:19 -0500 Subject: jbd2: check jh->b_transaction without taking j_list_lock jh->b_transaction is adequately protected for reading by the jbd_lock_bh_state(bh), so we don't need to take j_list_lock in __journal_try_to_free_buffer(). Signed-off-by: "Theodore Ts'o" diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 60bb365..78900a1 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1821,11 +1821,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) if (buffer_locked(bh) || buffer_dirty(bh)) goto out; - if (jh->b_next_transaction != NULL) + if (jh->b_next_transaction != NULL || jh->b_transaction != NULL) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + if (jh->b_cp_transaction != NULL) { /* written-back checkpointed metadata buffer */ JBUFFER_TRACE(jh, "remove from checkpoint list"); __jbd2_journal_remove_checkpoint(jh); -- cgit v0.10.2 From 6e4862a5bb9d12be87e4ea5d9a60836ebed71d28 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 9 Mar 2014 00:46:23 -0500 Subject: jbd2: minimize region locked by j_list_lock in journal_get_create_access() It's not needed until we start trying to modify fields in the journal_head which are protected by j_list_lock. Signed-off-by: "Theodore Ts'o" diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 78900a1..357f3dc 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1073,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * reused here. */ jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); J_ASSERT_JH(jh, (jh->b_transaction == transaction || jh->b_transaction == NULL || (jh->b_transaction == journal->j_committing_transaction && @@ -1096,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) jh->b_modified = 0; JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); } else if (jh->b_transaction == journal->j_committing_transaction) { /* first access by this transaction */ jh->b_modified = 0; JBUFFER_TRACE(jh, "set next transaction"); + spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; } spin_unlock(&journal->j_list_lock); -- cgit v0.10.2 From 0bfea8118d8e4f6aeb476511350d649e8dcb0ce8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 9 Mar 2014 00:56:58 -0500 Subject: jbd2: minimize region locked by j_list_lock in jbd2_journal_forget() It's not needed until we start trying to modify fields in the journal_head which are protected by j_list_lock. 
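Both of these patches apply the same pattern; a generic sketch of it (hypothetical fields, not the jbd2 structures): fields guarded by some other mechanism are read outside the hot lock, and the hot lock is taken only around the store it actually protects.

#include <pthread.h>

struct item {
	pthread_mutex_t list_lock;	/* protects next_owner only */
	int state;			/* guarded elsewhere, as with bh state */
	int next_owner;
};

void set_next_owner(struct item *it, int owner)
{
	/* Reading it->state here is fine without list_lock. */
	if (it->state == 0)
		return;

	/* Lock only the region that touches the protected field. */
	pthread_mutex_lock(&it->list_lock);
	it->next_owner = owner;
	pthread_mutex_unlock(&it->list_lock);
}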
Signed-off-by: "Theodore Ts'o" diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 357f3dc..d999b1f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1416,7 +1416,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); if (!buffer_jbd(bh)) goto not_jbd; @@ -1469,6 +1468,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) * we know to remove the checkpoint after we commit. */ + spin_lock(&journal->j_list_lock); if (jh->b_cp_transaction) { __jbd2_journal_temp_unlink_buffer(jh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); @@ -1481,6 +1481,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) goto drop; } } + spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { J_ASSERT_JH(jh, (jh->b_transaction == journal->j_committing_transaction)); @@ -1492,7 +1493,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (jh->b_next_transaction) { J_ASSERT(jh->b_next_transaction == transaction); + spin_lock(&journal->j_list_lock); jh->b_next_transaction = NULL; + spin_unlock(&journal->j_list_lock); /* * only drop a reference if this transaction modified @@ -1504,7 +1507,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) } not_jbd: - spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); __brelse(bh); drop: -- cgit v0.10.2 From 66a4cb187b92ca8663203fe8fda621e6585a2a00 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 12 Mar 2014 16:38:03 -0400 Subject: jbd2: improve error messages for inconsistent journal heads Fix up error messages printed when the transaction pointers in a journal head are inconsistent. This improves the error messages which are printed when running xfstests generic/068 in data=journal mode. 
See the bug report at: https://bugzilla.kernel.org/show_bug.cgi?id=60786 Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 3fe29de..c3fb607 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -259,6 +259,16 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + if (inode == NULL) { + pr_err("EXT4: jbd2_journal_dirty_metadata " + "failed: handle type %u started at " + "line %u, credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); + return err; + } ext4_error_inode(inode, where, line, bh->b_blocknr, "journal_dirty_metadata failed: " diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d999b1f..38cfcf5 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1313,7 +1313,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) journal->j_running_transaction)) { printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " - "journal->j_running_transaction (%p, %u)", + "journal->j_running_transaction (%p, %u)\n", journal->j_devname, (unsigned long long) bh->b_blocknr, jh->b_transaction, @@ -1336,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); - if (unlikely(jh->b_transaction != - journal->j_committing_transaction)) { - printk(KERN_ERR "JBD2: %s: " - "jh->b_transaction (%llu, %p, %u) != " - "journal->j_committing_transaction (%p, %u)", + if (unlikely(((jh->b_transaction != + journal->j_committing_transaction)) || + (jh->b_next_transaction != transaction))) { + printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: " + "bad jh for block %llu: " + "transaction (%p, %u), " + "jh->b_transaction (%p, %u), " + "jh->b_next_transaction (%p, %u), jlist %u\n", journal->j_devname, (unsigned long long) bh->b_blocknr, + transaction, transaction->t_tid, jh->b_transaction, - jh->b_transaction ? jh->b_transaction->t_tid : 0, - journal->j_committing_transaction, - journal->j_committing_transaction ? - journal->j_committing_transaction->t_tid : 0); - ret = -EINVAL; - } - if (unlikely(jh->b_next_transaction != transaction)) { - printk(KERN_ERR "JBD2: %s: " - "jh->b_next_transaction (%llu, %p, %u) != " - "transaction (%p, %u)", - journal->j_devname, - (unsigned long long) bh->b_blocknr, + jh->b_transaction ? + jh->b_transaction->t_tid : 0, jh->b_next_transaction, jh->b_next_transaction ? jh->b_next_transaction->t_tid : 0, - transaction, transaction->t_tid); + jh->b_jlist); + WARN_ON(1); ret = -EINVAL; } /* And this case is illegal: we can't reuse another -- cgit v0.10.2 From 02b9984d640873b7b3809e63f81a0d7e13496886 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 13 Mar 2014 10:14:33 -0400 Subject: fs: push sync_filesystem() down to the file system's remount_fs() Previously, the no-op "mount -o remount /dev/xxx" operation when the file system is already mounted read-write causes an implied, unconditional syncfs(). This seems pretty stupid, and it's certainly not documented or guaranteed to do this, nor is it particularly useful, except in the case where the file system was mounted rw and is getting remounted read-only. However, it's possible that there might be some file systems that are actually depending on this behavior. 
In most file systems, it's probably fine to only call sync_filesystem() when transitioning from read-write to read-only, and there are some file systems where this is not needed at all (for example, for a pseudo-filesystem or something like romfs). Signed-off-by: "Theodore Ts'o" Cc: linux-fsdevel@vger.kernel.org Cc: Christoph Hellwig Cc: Artem Bityutskiy Cc: Adrian Hunter Cc: Evgeniy Dushistov Cc: Jan Kara Cc: OGAWA Hirofumi Cc: Anders Larsen Cc: Phillip Lougher Cc: Kees Cook Cc: Mikulas Patocka Cc: Petr Vandrovec Cc: xfs@oss.sgi.com Cc: linux-btrfs@vger.kernel.org Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: codalist@coda.cs.cmu.edu Cc: linux-ext4@vger.kernel.org Cc: linux-f2fs-devel@lists.sourceforge.net Cc: fuse-devel@lists.sourceforge.net Cc: cluster-devel@redhat.com Cc: linux-mtd@lists.infradead.org Cc: jfs-discussion@lists.sourceforge.net Cc: linux-nfs@vger.kernel.org Cc: linux-nilfs@vger.kernel.org Cc: linux-ntfs-dev@lists.sourceforge.net Cc: ocfs2-devel@oss.oracle.com Cc: reiserfs-devel@vger.kernel.org diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 7b3003c..952aeb0 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -212,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options) static int adfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return parse_options(sb, data); } diff --git a/fs/affs/super.c b/fs/affs/super.c index d098731..3074530 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -530,6 +530,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); + sync_filesystem(sb); *flags |= MS_NODIRATIME; memcpy(volume, sbi->s_volume, 32); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 845d2d6..56d70c8 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -913,6 +913,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) static int befs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (!(*flags & MS_RDONLY)) return -EINVAL; return 0; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 97cc241..00cd0c5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1381,6 +1381,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; + sync_filesystem(sb); btrfs_remount_prepare(fs_info); ret = btrfs_parse_options(root, data); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 849f613..4942c94 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -541,6 +541,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int cifs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return 0; } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 506de34..3f48000 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -96,6 +96,7 @@ void coda_destroy_inodecache(void) static int coda_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NOATIME; return 0; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 06610cf..a275911 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -244,6 +244,7 @@ static void cramfs_kill_sb(struct super_block *sb) static int cramfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 
9c0444c..02928a9 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -218,6 +218,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data) int err; struct debugfs_fs_info *fsi = sb->s_fs_info; + sync_filesystem(sb); err = debugfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index a726b9f..c710380 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -313,6 +313,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; + sync_filesystem(sb); err = parse_mount_options(data, PARSE_REMOUNT, opts); /* diff --git a/fs/efs/super.c b/fs/efs/super.c index 50215bb..103bbd8 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -114,6 +114,7 @@ static void destroy_inodecache(void) static int efs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 20d6697..d260115 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1254,6 +1254,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) unsigned long old_sb_flags; int err; + sync_filesystem(sb); spin_lock(&sbi->s_lock); /* Store the old options */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 37fd31e..95c6c5a 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2649,6 +2649,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) int i; #endif + sync_filesystem(sb); + /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7a829f7..a5f1170 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4765,6 +4765,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #endif char *orig_data = kstrdup(data, GFP_KERNEL); + sync_filesystem(sb); + /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1a85f83..856bdf9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -568,6 +568,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_mount_info org_mount_opt; int err, active_logs; + sync_filesystem(sb); + /* * Save the old mount options in case we * need to restore them. diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 854b578..343e477 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -635,6 +635,8 @@ static int fat_remount(struct super_block *sb, int *flags, char *data) struct msdos_sb_info *sbi = MSDOS_SB(sb); *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + sync_filesystem(sb); + /* make sure we update state on remount. 
*/ new_rdonly = *flags & MS_RDONLY; if (new_rdonly != (sb->s_flags & MS_RDONLY)) { diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index e37eb27..7ca8c75 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -124,6 +124,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) static int vxfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d468643..ecdb255d 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -135,6 +135,7 @@ static void fuse_evict_inode(struct inode *inode) static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (*flags & MS_MANDLOCK) return -EINVAL; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 60f60f6..4c6dd50 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1175,6 +1175,8 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) struct gfs2_tune *gt = &sdp->sd_tune; int error; + sync_filesystem(sb); + spin_lock(>->gt_spin); args.ar_commit = gt->gt_logd_secs; args.ar_quota_quantum = gt->gt_quota_quantum; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 2d2039e..eee7206 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -112,6 +112,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 80875aa..8eb787b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -323,6 +323,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfsplus_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; if (!(*flags & MS_RDONLY)) { diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 4534ff6..fe3463a 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -421,6 +421,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) struct hpfs_sb_info *sbi = hpfs_sb(s); char *new_opts = kstrdup(data, GFP_KERNEL); + sync_filesystem(s); + *flags |= MS_NOATIME; hpfs_lock(s); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 4a9e10e..6af66ee 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -117,6 +117,7 @@ static void destroy_inodecache(void) static int isofs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (!(*flags & MS_RDONLY)) return -EROFS; return 0; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 0defb1c..0918f0e 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -243,6 +243,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); int err; + sync_filesystem(sb); err = jffs2_parse_options(c, data); if (err) return -EINVAL; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index e2b7483..97f7fda 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -418,6 +418,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) int flag = JFS_SBI(sb)->flag; int ret; + sync_filesystem(sb); if (!parse_options(data, sb, &newLVSize, &flag)) { return -EINVAL; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 0332109..dcdc298 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -123,6 +123,7 @@ static int minix_remount (struct 
super_block * sb, int * flags, char * data) struct minix_sb_info * sbi = minix_sb(sb); struct minix_super_block * ms; + sync_filesystem(sb); ms = sbi->s_ms; if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 2cf2ebe..5f86e80 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -99,6 +99,7 @@ static void destroy_inodecache(void) static int ncp_remount(struct super_block *sb, int *flags, char* data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return 0; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 910ed90..2cb5694 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2215,6 +2215,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; u32 nfsvers = nfss->nfs_client->rpc_ops->version; + sync_filesystem(sb); + /* * Userspace mount programs that send binary options generally send * them populated with default values. We have no way to know which diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 7ac2a12..8c532b2 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1129,6 +1129,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_mount_opt; int err; + sync_filesystem(sb); old_sb_flags = sb->s_flags; old_mount_opt = nilfs->ns_mount_opt; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 82650d5..bd5610d 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -468,6 +468,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_debug("Entering with remount options string: %s", opt); + sync_filesystem(sb); + #ifndef NTFS_RW /* For read-only compiled driver, enforce read-only flag. */ *flags |= MS_RDONLY; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 49d84f8..5f9bf8f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -631,6 +631,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) struct ocfs2_super *osb = OCFS2_SB(sb); u32 tmp; + sync_filesystem(sb); + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || !ocfs2_check_set_options(sb, &parsed_options)) { ret = -EINVAL; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 8c0ceb8..15e4500 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -368,6 +368,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino) static int openprom_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NOATIME; return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 87dbcbe..ac823a8 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -92,6 +92,8 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) int proc_remount(struct super_block *sb, int *flags, char *data) { struct pid_namespace *pid = sb->s_fs_info; + + sync_filesystem(sb); return !proc_parse_options(data, pid); } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1282384..192297b 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -249,6 +249,7 @@ static void parse_options(char *options) static int pstore_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); parse_options(data); return 0; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 89558810..c4bcb77 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -44,6 +44,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data) { struct qnx4_sb_info *qs; + sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = 
QNX4_VERSION; *flags |= MS_RDONLY; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 8d941ed..65cdaab 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -55,6 +55,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root) static int qnx6_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 2c80335..abf2b76 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1319,6 +1319,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) int i; #endif + sync_filesystem(s); reiserfs_write_lock(s); #ifdef CONFIG_QUOTA diff --git a/fs/romfs/super.c b/fs/romfs/super.c index d841878..ef90e8b 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -432,6 +432,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ static int romfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 202df63..031c8d67 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -371,6 +371,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int squashfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/super.c b/fs/super.c index 80d5cf2..e9dc3c3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -719,8 +719,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) } } - sync_filesystem(sb); - if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); if (retval) { diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index c327d4e..4742e58 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -60,6 +60,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data) { struct sysv_sb_info *sbi = SYSV_SB(sb); + sync_filesystem(sb); if (sbi->s_forced_ro) *flags |= MS_RDONLY; return 0; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 5ded849..e1598ab 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1827,6 +1827,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) int err; struct ubifs_info *c = sb->s_fs_info; + sync_filesystem(sb); dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); err = ubifs_parse_options(c, data, 1); diff --git a/fs/udf/super.c b/fs/udf/super.c index 3306b9f..64f2b733 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -646,6 +646,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) int error = 0; struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); + sync_filesystem(sb); if (lvidiu) { int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 329f2f5..b8c6791 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1280,6 +1280,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) unsigned new_mount_opt, ufstype; unsigned flags; + sync_filesystem(sb); lock_ufs(sb); mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f317488..aaa3eca 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1197,6 +1197,7 @@ xfs_fs_remount( char *p; int error; + sync_filesystem(sb); while ((p = strsep(&options, ",")) != NULL) { int token; -- cgit v0.10.2 From 
38c03b34391dd25a39576073e58485e5949d29fe Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 13 Mar 2014 22:49:42 -0400 Subject: ext4: only call sync_filesystem() when remounting read-only This is the only time it is required for ext4. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a5f1170..89baee4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4765,8 +4765,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #endif char *orig_data = kstrdup(data, GFP_KERNEL); - sync_filesystem(sb); - /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; @@ -4837,6 +4835,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (*flags & MS_RDONLY) { + err = sync_filesystem(sb); + if (err < 0) + goto restore_opts; err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; -- cgit v0.10.2 From 31cf0f2c3195258f83adabf1a71a782a92b8268a Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 13 Mar 2014 23:14:46 -0400 Subject: ext4: delete path dealloc code in ext4_ext_handle_uninitialized_extents Code deallocating the extent path referenced by an argument to ext4_ext_handle_uninitialized_extents was made redundant with identical code in its one caller, ext4_ext_map_blocks, by commit 3779473246. Allocating and deallocating the path in the same function also makes the code clearer. Signed-off-by: Eric Whitney Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bbba1ef..17b2fb2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4005,10 +4005,6 @@ out1: map->m_pblk = newblock; map->m_len = allocated; out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } return err ? err : allocated; } @@ -4208,7 +4204,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, err = ret; else allocated = ret; - goto out3; + goto out2; } } @@ -4489,7 +4485,6 @@ out2: kfree(path); } -out3: trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); ext4_es_lru_add(inode); -- cgit v0.10.2 From c06344939422bbd032ac967223a7863de57496b5 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 13 Mar 2014 23:34:16 -0400 Subject: ext4: fix partial cluster handling for bigalloc file systems Commit 9cb00419fa, which enables hole punching for bigalloc file systems, exposed a bug introduced by commit 6ae06ff51e in an earlier release. When run on a bigalloc file system, xfstests generic/013, 068, 075, 083, 091, 100, 112, 127, 263, 269, and 270 fail with e2fsck errors or cause kernel error messages indicating that previously freed blocks are being freed again. The latter commit optimizes the selection of the starting extent in ext4_ext_rm_leaf() when hole punching by beginning with the extent supplied in the path argument rather than with the last extent in the leaf node (as is still done when truncating). However, the code in rm_leaf that initially sets partial_cluster to track cluster sharing on extent boundaries is only guaranteed to run if rm_leaf starts with the last node in the leaf. Consequently, partial_cluster is not correctly initialized when hole punching, and a cluster on the boundary of a punched region that should be retained may instead be deallocated.
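The failure mode described above comes down to cluster arithmetic: with bigalloc, one cluster of 2^c blocks can back the tail of one extent and the head of the next. A self-contained sketch of the boundary test, assuming 2^c blocks per cluster; b2c() and pblk_coff() are stand-ins for the kernel's EXT4_B2C() and EXT4_PBLK_COFF() macros:

#include <stdio.h>

static unsigned long long b2c(unsigned long long pblk, unsigned int c)
{
	return pblk >> c;	/* physical block -> cluster number */
}

static unsigned long long pblk_coff(unsigned long long pblk, unsigned int c)
{
	return pblk & ((1ULL << c) - 1);	/* offset within its cluster */
}

int main(void)
{
	unsigned int c = 4;			/* 16 blocks per cluster */
	unsigned long long last_of_left = 1028;	/* rightmost block of the extent being removed */
	unsigned long long first_of_right = 1029; /* leftmost block of the next extent */

	/* Same cluster, and the right extent is not cluster aligned: the
	 * shared cluster is partial and must not be freed. */
	if (b2c(last_of_left, c) == b2c(first_of_right, c) &&
	    pblk_coff(first_of_right, c) != 0)
		printf("cluster %llu is shared; keep it\n",
		       b2c(first_of_right, c));
	else
		printf("no shared cluster\n");
	return 0;
}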
Signed-off-by: Eric Whitney Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 17b2fb2..e35f93b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2597,6 +2597,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); + /* + * If we're starting with an extent other than the last one in the + * node, we need to see if it shares a cluster with the extent to + * the right (towards the end of the file). If its leftmost cluster + * is this extent's rightmost cluster and it is not cluster aligned, + * we'll mark it as a partial that is not to be deallocated. + */ + + if (ex != EXT_LAST_EXTENT(eh)) { + ext4_fsblk_t current_pblk, right_pblk; + long long current_cluster, right_cluster; + + current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + current_cluster = (long long)EXT4_B2C(sbi, current_pblk); + right_pblk = ext4_ext_pblock(ex + 1); + right_cluster = (long long)EXT4_B2C(sbi, right_pblk); + if (current_cluster == right_cluster && + EXT4_PBLK_COFF(sbi, right_pblk)) + *partial_cluster = -right_cluster; + } + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); while (ex >= EXT_FIRST_EXTENT(eh) && -- cgit v0.10.2 From f282ac19d86f0507e91759dcf3d15fcb3a964d2a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 18 Mar 2014 17:44:35 -0400 Subject: ext4: Update inode i_size after the preallocation Currently in ext4_fallocate we would update inode size, c_time and sync the file with every partial allocation, which is entirely unnecessary. It is true that if the crash happens in the middle of fallocate we might end up with an unchanged i_size or c_time, which I do not think is really a problem - it does not mean file system corruption in any way. Note that xfs does things the same way, i.e. it updates all of the above after the allocation is done. This commit moves all the updates after the allocation is done. In addition we also need to change m_time, as not only the inode has been changed but also data regions might have changed (unwritten extents). However, m_time will only be updated when i_size changed. Also we do not need to be paranoid about changing the c_time only if the actual allocation has happened; we can change it even if we try to allocate only to find out that there are already blocks allocated. It's not really a big deal and it will save us some additional complexity. Also use ext4_debug, instead of ext4_warning, in the #ifdef EXT4FS_DEBUG section. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" - -- v3: Do not remove the code to set EXT4_INODE_EOFBLOCKS flag fs/ext4/extents.c | 96 ++++++++++++++++++++++++------------------------------- 1 file changed, 42 insertions(+), 54 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e35f93b..e4be6b7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4546,36 +4546,6 @@ retry: ext4_std_error(inode->i_sb, err); } -static void ext4_falloc_update_inode(struct inode *inode, - int mode, loff_t new_size, int update_ctime) -{ - struct timespec now; - - if (update_ctime) { - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; - } - /* - * Update only when preallocation was requested beyond - * the file size. 
- */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) { - if (new_size > i_size_read(inode)) - i_size_write(inode, new_size); - if (new_size > EXT4_I(inode)->i_disksize) - ext4_update_i_disksize(inode, new_size); - } else { - /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. - */ - if (new_size > i_size_read(inode)) - ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - } - -} - /* * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. @@ -4587,13 +4557,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); handle_t *handle; - loff_t new_size; + loff_t new_size = 0; unsigned int max_blocks; int ret = 0; int ret2 = 0; int retries = 0; int flags; struct ext4_map_blocks map; + struct timespec tv; unsigned int credits, blkbits = inode->i_blkbits; /* Return error if mode is not supported */ @@ -4631,12 +4602,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) */ credits = ext4_chunk_trans_blocks(inode, max_blocks); mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) { - mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); - return ret; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; } + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; @@ -4660,28 +4634,14 @@ retry: } ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { -#ifdef EXT4FS_DEBUG - ext4_warning(inode->i_sb, - "inode #%lu: block %u: len %u: " - "ext4_ext_map_blocks returned %d", - inode->i_ino, map.m_lblk, - map.m_len, ret); -#endif + ext4_debug("inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); break; } - if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, - blkbits) >> blkbits)) - new_size = offset + len; - else - new_size = ((loff_t) map.m_lblk + ret) << blkbits; - - ext4_falloc_update_inode(inode, mode, new_size, - (map.m_flags & EXT4_MAP_NEW)); - ext4_mark_inode_dirty(handle, inode); - if ((file->f_flags & O_SYNC) && ret >= max_blocks) - ext4_handle_sync(handle); ret2 = ext4_journal_stop(handle); if (ret2) break; @@ -4691,6 +4651,34 @@ retry: ret = 0; goto retry; } + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + goto out; + + tv = inode->i_ctime = ext4_current_time(inode); + + if (ret > 0 && new_size) { + if (new_size > i_size_read(inode)) { + i_size_write(inode, new_size); + inode->i_mtime = tv; + } + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } else if (ret > 0 && !new_size) { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + } + ext4_mark_inode_dirty(handle, inode); + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out: mutex_unlock(&inode->i_mutex); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret > 0 ? 
ret2 : ret); -- cgit v0.10.2 From 0e8b6879f3c234036181526683be2b0231892ae4 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 18 Mar 2014 18:03:51 -0400 Subject: ext4: refactor ext4_fallocate code Move block allocation out of ext4_fallocate into a separate function called ext4_alloc_file_blocks(). This will allow us to use the same allocation code for other allocation operations such as zero range, which is committed in the next patch. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e4be6b7..2db2d77 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4546,6 +4546,64 @@ retry: ext4_std_error(inode->i_sb, err); } +static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, + ext4_lblk_t len, int flags, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct ext4_map_blocks map; + unsigned int credits; + + map.m_lblk = offset; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNINIT_MAX_LEN) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, len); + +retry: + while (ret >= 0 && ret < len) { + map.m_lblk = map.m_lblk + ret; + map.m_len = len = len - ret; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret <= 0) { + ext4_debug("inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + + return ret > 0 ? ret2 : ret; +} + /* * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. 
@@ -4560,12 +4618,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) loff_t new_size = 0; unsigned int max_blocks; int ret = 0; - int ret2 = 0; - int retries = 0; int flags; - struct ext4_map_blocks map; + ext4_lblk_t lblk; struct timespec tv; - unsigned int credits, blkbits = inode->i_blkbits; + unsigned int blkbits = inode->i_blkbits; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | @@ -4590,17 +4646,18 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EOPNOTSUPP; trace_ext4_fallocate_enter(inode, offset, len, mode); - map.m_lblk = offset >> blkbits; + lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk; - /* - * credits to insert 1 extent into extent tree - */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); + - lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + mutex_lock(&inode->i_mutex); if (!(mode & FALLOC_FL_KEEP_SIZE) && @@ -4611,46 +4668,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; - if (mode & FALLOC_FL_KEEP_SIZE) - flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - /* - * Don't normalize the request if it can fit in one extent so - * that it doesn't get unnecessarily split into multiple - * extents. - */ - if (len <= EXT_UNINIT_MAX_LEN << blkbits) - flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - -retry: - while (ret >= 0 && ret < max_blocks) { - map.m_lblk = map.m_lblk + ret; - map.m_len = max_blocks = max_blocks - ret; - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; - } - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret <= 0) { - ext4_debug("inode #%lu: block %u: len %u: " - "ext4_ext_map_blocks returned %d", - inode->i_ino, map.m_lblk, - map.m_len, ret); - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - break; - } - ret2 = ext4_journal_stop(handle); - if (ret2) - break; - } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) { - ret = 0; - goto retry; - } + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); + if (ret) + goto out; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) @@ -4658,14 +4678,14 @@ retry: tv = inode->i_ctime = ext4_current_time(inode); - if (ret > 0 && new_size) { + if (!ret && new_size) { if (new_size > i_size_read(inode)) { i_size_write(inode, new_size); inode->i_mtime = tv; } if (new_size > EXT4_I(inode)->i_disksize) ext4_update_i_disksize(inode, new_size); - } else if (ret > 0 && !new_size) { + } else if (!ret && !new_size) { /* * Mark that we allocate beyond EOF so the subsequent truncate * can proceed even if the new size is the same as i_size. @@ -4680,9 +4700,8 @@ retry: ext4_journal_stop(handle); out: mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, - ret > 0 ? ret2 : ret); - return ret > 0 ? 
ret2 : ret; + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; } /* -- cgit v0.10.2 From b8a8684502a0fc852afa0056c6bb2a9273f6fcc0 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 18 Mar 2014 18:05:35 -0400 Subject: ext4: Introduce FALLOC_FL_ZERO_RANGE flag for fallocate Introduce new FALLOC_FL_ZERO_RANGE flag for fallocate. This has the same functionality as the xfs ioctl XFS_IOC_ZERO_RANGE. It can be used to convert a range of a file to zeros, preferably without issuing data IO. Blocks should be preallocated for the regions that span holes in the file, and the entire range is preferably converted to unwritten extents. This can also be used to preallocate blocks past EOF in the same way as with fallocate. The FALLOC_FL_KEEP_SIZE flag causes the inode size to remain the same. Also add appropriate tracepoints. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index beec427..1b3cbf8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -568,6 +568,8 @@ enum { #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 /* Do not put hole in extent cache */ #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 /* * The bit position of these flags must not overlap with any of the diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2db2d77..464e95d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3602,6 +3602,8 @@ out: * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * + * This works the same way in the case of initialized -> unwritten conversion. + * * One of more index blocks maybe needed if the extent tree grow after * the uninitialized extent split. To prevent ENOSPC occur at the IO * complete, we need to split the uninitialized extent before DIO submit @@ -3612,7 +3614,7 @@ out: * * Returns the size of uninitialized extent to be written on success. */ -static int ext4_split_unwritten_extents(handle_t *handle, +static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, @@ -3624,9 +3626,9 @@ static int ext4_split_unwritten_extents(handle_t *handle, unsigned int ee_len; int split_flag = 0, depth; - ext_debug("ext4_split_unwritten_extents: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", + __func__, inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; @@ -3641,14 +3643,73 @@ static int ext4_split_unwritten_extents(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= EXT4_EXT_MARK_UNINIT2; - if (flags & EXT4_GET_BLOCKS_CONVERT) - split_flag |= EXT4_EXT_DATA_VALID2; + /* Convert to unwritten */ + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + split_flag |= EXT4_EXT_DATA_VALID1; + /* Convert to initialized */ + } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + split_flag |= ee_block + ee_len <= eof_block ? 
+ EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2); + } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, path, map, split_flag, flags); } +static int ext4_convert_initialized_extents(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical" + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + goto out; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as uninitialized */ + ext4_ext_mark_uninitialized(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + + static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3682,8 +3743,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_unwritten_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT); + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT); if (err < 0) goto out; ext4_ext_drop_refs(path); @@ -3884,6 +3945,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } static int +ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + + /* + * Make sure that the extent is no bigger than we support with + * uninitialized extent + */ + if (map->m_len > EXT_UNINIT_MAX_LEN) + map->m_len = EXT_UNINIT_MAX_LEN / 2; + + ret = ext4_convert_initialized_extents(handle, inode, map, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + map->m_flags |= EXT4_MAP_UNWRITTEN; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + + return err ? 
err : allocated; +} + +static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, @@ -3910,8 +4003,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - ret = ext4_split_unwritten_extents(handle, inode, map, - path, flags); + ret = ext4_split_convert_extents(handle, inode, map, + path, flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; /* @@ -4199,6 +4292,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; + /* * Uninitialized extents are treated as holes, except that * we split out initialized portions during a write. @@ -4215,7 +4309,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if (!ext4_ext_is_uninitialized(ex)) + /* + * If the extent is initialized check whether the + * caller wants to convert it to unwritten. + */ + if ((!ext4_ext_is_uninitialized(ex)) && + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { + allocated = ext4_ext_convert_initialized_extent( + handle, inode, map, path, flags, + allocated, newblock); + goto out2; + } else if (!ext4_ext_is_uninitialized(ex)) goto out; ret = ext4_ext_handle_uninitialized_extents( @@ -4604,6 +4708,144 @@ retry: return ret > 0 ? ret2 : ret; } +static long ext4_zero_range(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle = NULL; + unsigned int max_blocks; + loff_t new_size = 0; + int ret = 0; + int flags; + int partial; + loff_t start, end; + ext4_lblk_t lblk; + struct address_space *mapping = inode->i_mapping; + unsigned int blkbits = inode->i_blkbits; + + trace_ext4_zero_range(inode, offset, len, mode); + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + len - 1); + if (ret) + return ret; + } + + /* + * Round up offset. This is not fallocate, we neet to zero out + * blocks, so convert interior block aligned part of the range to + * unwritten and possibly manually zero out unaligned parts of the + * range. + */ + start = round_up(offset, 1 << blkbits); + end = round_down((offset + len), 1 << blkbits); + + if (start < offset || end > offset + len) + return -EINVAL; + partial = (offset + len) & ((1 << blkbits) - 1); + + lblk = start >> blkbits; + max_blocks = (end >> blkbits); + if (max_blocks < lblk) + max_blocks = 0; + else + max_blocks -= lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + mutex_lock(&inode->i_mutex); + + /* + * Indirect files do not support unwritten extnets + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out_mutex; + /* + * If we have a partial block after EOF we have to allocate + * the entire block. 
+ */ + if (partial) + max_blocks += 1; + } + + if (max_blocks > 0) { + + /* Now release the pages and zero block aligned part of pages*/ + truncate_pagecache_range(inode, start, end - 1); + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Remove entire range from the extent status tree. + */ + ret = ext4_es_remove_extent(inode, lblk, max_blocks); + if (ret) + goto out_dio; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, + mode); + if (ret) + goto out_dio; + } + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(inode->i_sb, ret); + goto out_dio; + } + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + + if (!ret && new_size) { + if (new_size > i_size_read(inode)) + i_size_write(inode, new_size); + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } else if (!ret && !new_size) { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + } + + ext4_mark_inode_dirty(handle, inode); + + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} + /* * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. @@ -4625,7 +4867,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE)) + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -4645,6 +4887,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + if (mode & FALLOC_FL_ZERO_RANGE) + return ext4_zero_range(file, offset, len, mode); + trace_ext4_fallocate_enter(inode, offset, len, mode); lblk = offset >> blkbits; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ab3e835..7cc2455 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -503,6 +503,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; + int ret = 0; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -558,7 +559,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, EXT4_GET_BLOCKS_KEEP_SIZE); } if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -585,7 +585,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -602,7 +602,13 @@ found: * with buffer head unmapped. 
*/ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) - return retval; + /* + * If we need to convert extent to unwritten + * we continue and do the actual work in + * ext4_ext_map_blocks() + */ + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) + return retval; /* * Here we clear m_flags because after allocating an new extent, @@ -658,7 +664,6 @@ found: ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -693,7 +698,7 @@ found: has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -3507,7 +3512,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - trace_ext4_punch_hole(inode, offset, length); + trace_ext4_punch_hole(inode, offset, length, 0); /* * Write out all dirty pages to avoid race conditions diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index e9d7ee7..010ea89 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -21,6 +21,10 @@ struct extent_status; #define FALLOC_FL_COLLAPSE_RANGE 0x08 #endif +#ifndef FALLOC_FL_ZERO_RANGE +#define FALLOC_FL_ZERO_RANGE 0x10 +#endif + #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) #define show_mballoc_flags(flags) __print_flags(flags, "|", \ @@ -77,7 +81,8 @@ struct extent_status; { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \ { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \ { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}, \ - { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}) + { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \ + { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}) TRACE_EVENT(ext4_free_inode, @@ -1339,7 +1344,7 @@ TRACE_EVENT(ext4_direct_IO_exit, __entry->rw, __entry->ret) ); -TRACE_EVENT(ext4_fallocate_enter, +DECLARE_EVENT_CLASS(ext4__fallocate_mode, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode), @@ -1347,23 +1352,45 @@ TRACE_EVENT(ext4_fallocate_enter, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( loff_t, pos ) - __field( loff_t, len ) + __field( loff_t, offset ) + __field( loff_t, len ) __field( int, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->pos = offset; + __entry->offset = offset; __entry->len = len; __entry->mode = mode; ), - TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %s", + TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->pos, - __entry->len, show_falloc_mode(__entry->mode)) + (unsigned long) __entry->ino, + __entry->offset, __entry->len, + show_falloc_mode(__entry->mode)) +); + +DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter, + + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), + + TP_ARGS(inode, offset, len, mode) +); + +DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole, + + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), + + TP_ARGS(inode, offset, len, mode) +); + +DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range, + + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), + + TP_ARGS(inode, offset, len, mode) ); TRACE_EVENT(ext4_fallocate_exit, @@ -1395,31 +1422,6 @@ TRACE_EVENT(ext4_fallocate_exit, __entry->ret) ); 
-TRACE_EVENT(ext4_punch_hole, - TP_PROTO(struct inode *inode, loff_t offset, loff_t len), - - TP_ARGS(inode, offset, len), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, offset ) - __field( loff_t, len ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->offset = offset; - __entry->len = len; - ), - - TP_printk("dev %d,%d ino %lu offset %lld len %lld", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->offset, __entry->len) -); - TRACE_EVENT(ext4_unlink_enter, TP_PROTO(struct inode *parent, struct dentry *dentry), -- cgit v0.10.2 From 3e037e5211252902a188a6a11aecd247409d0229 Mon Sep 17 00:00:00 2001 From: T Makphaibulchoke Date: Tue, 18 Mar 2014 19:19:41 -0400 Subject: fs/mbcache.c: change block and index hash chain to hlist_bl_node This patch changes each mb_cache's both block and index hash chains to use a hlist_bl_node, which contains a built-in lock. This is the first step in decoupling of locks serializing accesses to mb_cache global data and each mb_cache_entry local data. Signed-off-by: T. Makphaibulchoke Signed-off-by: "Theodore Ts'o" diff --git a/fs/mbcache.c b/fs/mbcache.c index e519e45..55db0da 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -34,9 +34,9 @@ #include #include #include -#include +#include #include - +#include #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ @@ -87,21 +87,38 @@ static LIST_HEAD(mb_cache_lru_list); static DEFINE_SPINLOCK(mb_cache_spinlock); static inline int -__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) +__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) { - return !list_empty(&ce->e_block_list); + return !hlist_bl_unhashed(&ce->e_block_list); } -static void -__mb_cache_entry_unhash(struct mb_cache_entry *ce) +static inline void +__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) { - if (__mb_cache_entry_is_hashed(ce)) { - list_del_init(&ce->e_block_list); - list_del(&ce->e_index.o_list); - } + if (__mb_cache_entry_is_block_hashed(ce)) + hlist_bl_del_init(&ce->e_block_list); +} + +static inline int +__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) +{ + return !hlist_bl_unhashed(&ce->e_index.o_list); } +static inline void +__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) +{ + if (__mb_cache_entry_is_index_hashed(ce)) + hlist_bl_del_init(&ce->e_index.o_list); +} + +static inline void +__mb_cache_entry_unhash(struct mb_cache_entry *ce) +{ + __mb_cache_entry_unhash_index(ce); + __mb_cache_entry_unhash_block(ce); +} static void __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) @@ -125,7 +142,7 @@ __mb_cache_entry_release_unlock(struct mb_cache_entry *ce) ce->e_used -= MB_CACHE_WRITER; ce->e_used--; if (!(ce->e_used || ce->e_queued)) { - if (!__mb_cache_entry_is_hashed(ce)) + if (!__mb_cache_entry_is_block_hashed(ce)) goto forget; mb_assert(list_empty(&ce->e_lru_list)); list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); @@ -221,18 +238,18 @@ mb_cache_create(const char *name, int bucket_bits) cache->c_name = name; atomic_set(&cache->c_entry_count, 0); cache->c_bucket_bits = bucket_bits; - cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), - GFP_KERNEL); + cache->c_block_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_block_hash) goto fail; for (n=0; nc_block_hash[n]); - cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), - GFP_KERNEL); + INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]); + 
cache->c_index_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_index_hash) goto fail; for (n=0; nc_index_hash[n]); + INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]); cache->c_entry_cache = kmem_cache_create(name, sizeof(struct mb_cache_entry), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); @@ -364,10 +381,13 @@ mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) return NULL; atomic_inc(&cache->c_entry_count); INIT_LIST_HEAD(&ce->e_lru_list); - INIT_LIST_HEAD(&ce->e_block_list); + INIT_HLIST_BL_NODE(&ce->e_block_list); + INIT_HLIST_BL_NODE(&ce->e_index.o_list); ce->e_cache = cache; ce->e_queued = 0; } + ce->e_block_hash_p = &cache->c_block_hash[0]; + ce->e_index_hash_p = &cache->c_index_hash[0]; ce->e_used = 1 + MB_CACHE_WRITER; return ce; } @@ -393,25 +413,32 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, { struct mb_cache *cache = ce->e_cache; unsigned int bucket; - struct list_head *l; + struct hlist_bl_node *l; int error = -EBUSY; + struct hlist_bl_head *block_hash_p; + struct hlist_bl_head *index_hash_p; + struct mb_cache_entry *lce; + mb_assert(ce); bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); + block_hash_p = &cache->c_block_hash[bucket]; spin_lock(&mb_cache_spinlock); - list_for_each_prev(l, &cache->c_block_hash[bucket]) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_block_list); - if (ce->e_bdev == bdev && ce->e_block == block) + hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { + if (lce->e_bdev == bdev && lce->e_block == block) goto out; } + mb_assert(!__mb_cache_entry_is_block_hashed(ce)); __mb_cache_entry_unhash(ce); ce->e_bdev = bdev; ce->e_block = block; - list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); + ce->e_block_hash_p = block_hash_p; ce->e_index.o_key = key; bucket = hash_long(key, cache->c_bucket_bits); - list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); + index_hash_p = &cache->c_index_hash[bucket]; + ce->e_index_hash_p = index_hash_p; + hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); + hlist_bl_add_head(&ce->e_block_list, block_hash_p); error = 0; out: spin_unlock(&mb_cache_spinlock); @@ -463,14 +490,16 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, sector_t block) { unsigned int bucket; - struct list_head *l; + struct hlist_bl_node *l; struct mb_cache_entry *ce; + struct hlist_bl_head *block_hash_p; bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); + block_hash_p = &cache->c_block_hash[bucket]; spin_lock(&mb_cache_spinlock); - list_for_each(l, &cache->c_block_hash[bucket]) { - ce = list_entry(l, struct mb_cache_entry, e_block_list); + hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { + mb_assert(ce->e_block_hash_p == block_hash_p); if (ce->e_bdev == bdev && ce->e_block == block) { DEFINE_WAIT(wait); @@ -489,7 +518,7 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, finish_wait(&mb_cache_queue, &wait); ce->e_used += 1 + MB_CACHE_WRITER; - if (!__mb_cache_entry_is_hashed(ce)) { + if (!__mb_cache_entry_is_block_hashed(ce)) { __mb_cache_entry_release_unlock(ce); return NULL; } @@ -506,12 +535,14 @@ cleanup: #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) static struct mb_cache_entry * -__mb_cache_entry_find(struct list_head *l, struct list_head *head, +__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, struct block_device *bdev, unsigned int key) { 
- while (l != head) { + while (l != NULL) { struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_index.o_list); + hlist_bl_entry(l, struct mb_cache_entry, + e_index.o_list); + mb_assert(ce->e_index_hash_p == head); if (ce->e_bdev == bdev && ce->e_index.o_key == key) { DEFINE_WAIT(wait); @@ -532,7 +563,7 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head, } finish_wait(&mb_cache_queue, &wait); - if (!__mb_cache_entry_is_hashed(ce)) { + if (!__mb_cache_entry_is_block_hashed(ce)) { __mb_cache_entry_release_unlock(ce); spin_lock(&mb_cache_spinlock); return ERR_PTR(-EAGAIN); @@ -562,12 +593,16 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, unsigned int key) { unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct list_head *l; - struct mb_cache_entry *ce; + struct hlist_bl_node *l; + struct mb_cache_entry *ce = NULL; + struct hlist_bl_head *index_hash_p; + index_hash_p = &cache->c_index_hash[bucket]; spin_lock(&mb_cache_spinlock); - l = cache->c_index_hash[bucket].next; - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); + if (!hlist_bl_empty(index_hash_p)) { + l = hlist_bl_first(index_hash_p); + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); + } spin_unlock(&mb_cache_spinlock); return ce; } @@ -597,12 +632,16 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, { struct mb_cache *cache = prev->e_cache; unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct list_head *l; + struct hlist_bl_node *l; struct mb_cache_entry *ce; + struct hlist_bl_head *index_hash_p; + index_hash_p = &cache->c_index_hash[bucket]; + mb_assert(prev->e_index_hash_p == index_hash_p); spin_lock(&mb_cache_spinlock); + mb_assert(!hlist_bl_empty(index_hash_p)); l = prev->e_index.o_list.next; - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); __mb_cache_entry_release_unlock(prev); return ce; } diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index 5525d37..6a392e7 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -3,19 +3,21 @@ (C) 2001 by Andreas Gruenbacher, */ - struct mb_cache_entry { struct list_head e_lru_list; struct mb_cache *e_cache; unsigned short e_used; unsigned short e_queued; + atomic_t e_refcnt; struct block_device *e_bdev; sector_t e_block; - struct list_head e_block_list; + struct hlist_bl_node e_block_list; struct { - struct list_head o_list; + struct hlist_bl_node o_list; unsigned int o_key; } e_index; + struct hlist_bl_head *e_block_hash_p; + struct hlist_bl_head *e_index_hash_p; }; struct mb_cache { @@ -25,8 +27,8 @@ struct mb_cache { int c_max_entries; int c_bucket_bits; struct kmem_cache *c_entry_cache; - struct list_head *c_block_hash; - struct list_head *c_index_hash; + struct hlist_bl_head *c_block_hash; + struct hlist_bl_head *c_index_hash; }; /* Functions on caches */ -- cgit v0.10.2 From 1f3e55fe02d12213f87869768aa2b0bad3ba9a7d Mon Sep 17 00:00:00 2001 From: T Makphaibulchoke Date: Tue, 18 Mar 2014 19:23:20 -0400 Subject: fs/mbcache.c: doucple the locking of local from global data The patch increases the parallelism of mbcache by using the built-in lock in the hlist_bl_node to protect the mb_cache's local block and index hash chains. The global data mb_cache_lru_list and mb_cache_list continue to be protected by the global mb_cache_spinlock. New block group spinlock, mb_cache_bg_lock is also added to serialize accesses to mb_cache_entry's local data. 
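The per-chain locking described above relies on hlist_bl, whose head pointer embeds a single lock bit, so each hash chain can be locked independently instead of funneling through one global spinlock. A minimal sketch of the lookup pattern under that scheme (the demo_* names are hypothetical; the hlist_bl calls are the real API, also used in the diffs below):

#include <linux/list_bl.h>

struct demo_entry {
	struct hlist_bl_node node;
	unsigned int key;
};

/* Walk one hash chain while holding only that chain's lock. */
static struct demo_entry *demo_lookup(struct hlist_bl_head *head,
				      unsigned int key)
{
	struct hlist_bl_node *pos;
	struct demo_entry *e;

	hlist_bl_lock(head);
	hlist_bl_for_each_entry(e, pos, head, node) {
		if (e->key == key) {
			hlist_bl_unlock(head);
			return e;
		}
	}
	hlist_bl_unlock(head);
	return NULL;
}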
A new member e_refcnt is added to the mb_cache_entry structure to help prevent an mb_cache_entry from being deallocated by a free while it is being referenced by either mb_cache_entry_get() or mb_cache_entry_find(). Signed-off-by: T. Makphaibulchoke Signed-off-by: "Theodore Ts'o" diff --git a/fs/mbcache.c b/fs/mbcache.c index 55db0da..786ecab 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -26,6 +26,41 @@ * back on the lru list. */ +/* + * Lock descriptions and usage: + * + * Each hash chain of both the block and index hash tables now contains + * a built-in lock used to serialize accesses to the hash chain. + * + * Accesses to global data structures mb_cache_list and mb_cache_lru_list + * are serialized via the global spinlock mb_cache_spinlock. + * + * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize + * accesses to its local data, such as e_used and e_queued. + * + * Lock ordering: + * + * Each block hash chain's lock has the highest lock order, followed by an + * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's + * lock), and mb_cache_spinlock, with the lowest order. While holding + * either a block or index hash chain lock, a thread can acquire + * mb_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. + * + * Synchronization: + * + * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and + * index hash chains, they need to lock the corresponding hash chain. For each + * mb_cache_entry within the chain, they need to lock the mb_cache_entry to + * prevent any simultaneous release or free on the entry and also + * to serialize accesses to its e_used and e_queued members. + * + * To avoid having a dangling reference to an already freed + * mb_cache_entry, an mb_cache_entry is only freed when it is not on a + * block hash chain and also no longer being referenced, i.e. both e_used + * and e_queued are 0. When an mb_cache_entry is explicitly freed it is + * first removed from a block hash chain. + */ + #include #include @@ -37,6 +72,7 @@ #include #include #include +#include <linux/blockgroup_lock.h> #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ @@ -57,8 +93,13 @@ #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) +#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS) +#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ + (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) + static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); - +static struct blockgroup_lock *mb_cache_bg_lock; + MODULE_AUTHOR("Andreas Gruenbacher "); MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); MODULE_LICENSE("GPL"); @@ -86,6 +127,20 @@ static LIST_HEAD(mb_cache_list); static LIST_HEAD(mb_cache_lru_list); static DEFINE_SPINLOCK(mb_cache_spinlock); +static inline void +__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) +{ + spin_lock(bgl_lock_ptr(mb_cache_bg_lock, + MB_CACHE_ENTRY_LOCK_INDEX(ce))); +} + +static inline void +__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) +{ + spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, + MB_CACHE_ENTRY_LOCK_INDEX(ce))); +} + static inline int __mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) { @@ -113,11 +168,21 @@ __mb_cache_entry_unhash_index(struct mb_cache_entry *ce) hlist_bl_del_init(&ce->e_index.o_list); } +/* + * __mb_cache_entry_unhash_unlock() + * + * This function is called to unhash both the block and index hash + * chains. + * It assumes both the block and index hash chains are locked upon entry.
+ * It also unlocks both hash chains upon exit. */ static inline void -__mb_cache_entry_unhash(struct mb_cache_entry *ce) +__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) { __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); } static void @@ -125,36 +190,47 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) { struct mb_cache *cache = ce->e_cache; - mb_assert(!(ce->e_used || ce->e_queued)); + mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); kmem_cache_free(cache->c_entry_cache, ce); atomic_dec(&cache->c_entry_count); } - static void -__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) - __releases(mb_cache_spinlock) +__mb_cache_entry_release(struct mb_cache_entry *ce) { + /* First lock the entry to serialize access to its local data. */ + __spin_lock_mb_cache_entry(ce); /* Wake up all processes queuing for this cache entry. */ if (ce->e_queued) wake_up_all(&mb_cache_queue); if (ce->e_used >= MB_CACHE_WRITER) ce->e_used -= MB_CACHE_WRITER; + /* + * Make sure that all cache entries on lru_list have + * both e_used and e_queued equal to 0. + */ ce->e_used--; - if (!(ce->e_used || ce->e_queued)) { - if (!__mb_cache_entry_is_block_hashed(ce)) + if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { + if (!__mb_cache_entry_is_block_hashed(ce)) { + __spin_unlock_mb_cache_entry(ce); goto forget; - mb_assert(list_empty(&ce->e_lru_list)); - list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); + } + /* + * Need access to lru list, first drop entry lock, + * then reacquire the lock in the proper order. + */ + spin_lock(&mb_cache_spinlock); + if (list_empty(&ce->e_lru_list)) + list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); + spin_unlock(&mb_cache_spinlock); } - spin_unlock(&mb_cache_spinlock); + __spin_unlock_mb_cache_entry(ce); return; forget: - spin_unlock(&mb_cache_spinlock); + mb_assert(list_empty(&ce->e_lru_list)); __mb_cache_entry_forget(ce, GFP_KERNEL); } - /* * mb_cache_shrink_scan() memory pressure callback * @@ -177,17 +253,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) mb_debug("trying to free %d entries", nr_to_scan); spin_lock(&mb_cache_spinlock); - while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { + while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { struct mb_cache_entry *ce = list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); - freed++; + struct mb_cache_entry, e_lru_list); + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* Prevent any find or get operation on the entry */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + spin_lock(&mb_cache_spinlock); + continue; + } + __mb_cache_entry_unhash_unlock(ce); + list_add_tail(&ce->e_lru_list, &free_list); + spin_lock(&mb_cache_spinlock); } spin_unlock(&mb_cache_spinlock); + list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { __mb_cache_entry_forget(entry, gfp_mask); + freed++; } return freed; } @@ -232,6 +325,14 @@ mb_cache_create(const char *name, int bucket_bits) int n,
bucket_count = 1 << bucket_bits; struct mb_cache *cache = NULL; + if (!mb_cache_bg_lock) { + mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), + GFP_KERNEL); + if (!mb_cache_bg_lock) + return NULL; + bgl_lock_init(mb_cache_bg_lock); + } + cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) return NULL; @@ -290,21 +391,47 @@ void mb_cache_shrink(struct block_device *bdev) { LIST_HEAD(free_list); - struct list_head *l, *ltmp; + struct list_head *l; + struct mb_cache_entry *ce, *tmp; + l = &mb_cache_lru_list; spin_lock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &mb_cache_lru_list) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_lru_list); + while (!list_is_last(l, &mb_cache_lru_list)) { + l = l->next; + ce = list_entry(l, struct mb_cache_entry, e_lru_list); if (ce->e_bdev == bdev) { - list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* + * Prevent any find or get operation on the entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + continue; + } + __mb_cache_entry_unhash_unlock(ce); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + list_add_tail(&ce->e_lru_list, &free_list); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); } } spin_unlock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &free_list) { - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, - e_lru_list), GFP_KERNEL); + + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { + __mb_cache_entry_forget(ce, GFP_KERNEL); } } @@ -320,23 +447,27 @@ void mb_cache_destroy(struct mb_cache *cache) { LIST_HEAD(free_list); - struct list_head *l, *ltmp; + struct mb_cache_entry *ce, *tmp; spin_lock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &mb_cache_lru_list) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_lru_list); - if (ce->e_cache == cache) { + list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { + if (ce->e_cache == cache) list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); - } } list_del(&cache->c_cache_list); spin_unlock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &free_list) { - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, - e_lru_list), GFP_KERNEL); + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { + list_del_init(&ce->e_lru_list); + /* + * Prevent any find or get operation on the entry. 
+ */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + __mb_cache_entry_unhash_unlock(ce); + __mb_cache_entry_forget(ce, GFP_KERNEL); } if (atomic_read(&cache->c_entry_count) > 0) { @@ -345,8 +476,6 @@ mb_cache_destroy(struct mb_cache *cache) atomic_read(&cache->c_entry_count)); } - kmem_cache_destroy(cache->c_entry_cache); - kfree(cache->c_index_hash); kfree(cache->c_block_hash); kfree(cache); @@ -363,29 +492,59 @@ mb_cache_destroy(struct mb_cache *cache) struct mb_cache_entry * mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) { - struct mb_cache_entry *ce = NULL; + struct mb_cache_entry *ce; if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { + struct list_head *l; + + l = &mb_cache_lru_list; spin_lock(&mb_cache_spinlock); - if (!list_empty(&mb_cache_lru_list)) { - ce = list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_del_init(&ce->e_lru_list); - __mb_cache_entry_unhash(ce); + while (!list_is_last(l, &mb_cache_lru_list)) { + l = l->next; + ce = list_entry(l, struct mb_cache_entry, e_lru_list); + if (ce->e_cache == cache) { + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* + * Prevent any find or get operation on the + * entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + continue; + } + mb_assert(list_empty(&ce->e_lru_list)); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + __mb_cache_entry_unhash_unlock(ce); + goto found; + } } spin_unlock(&mb_cache_spinlock); } - if (!ce) { - ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); - if (!ce) - return NULL; - atomic_inc(&cache->c_entry_count); - INIT_LIST_HEAD(&ce->e_lru_list); - INIT_HLIST_BL_NODE(&ce->e_block_list); - INIT_HLIST_BL_NODE(&ce->e_index.o_list); - ce->e_cache = cache; - ce->e_queued = 0; - } + + ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); + if (!ce) + return NULL; + atomic_inc(&cache->c_entry_count); + INIT_LIST_HEAD(&ce->e_lru_list); + INIT_HLIST_BL_NODE(&ce->e_block_list); + INIT_HLIST_BL_NODE(&ce->e_index.o_list); + ce->e_cache = cache; + ce->e_queued = 0; + atomic_set(&ce->e_refcnt, 0); +found: ce->e_block_hash_p = &cache->c_block_hash[0]; ce->e_index_hash_p = &cache->c_index_hash[0]; ce->e_used = 1 + MB_CACHE_WRITER; @@ -414,7 +573,6 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, struct mb_cache *cache = ce->e_cache; unsigned int bucket; struct hlist_bl_node *l; - int error = -EBUSY; struct hlist_bl_head *block_hash_p; struct hlist_bl_head *index_hash_p; struct mb_cache_entry *lce; @@ -423,26 +581,29 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); block_hash_p = &cache->c_block_hash[bucket]; - spin_lock(&mb_cache_spinlock); + hlist_bl_lock(block_hash_p); hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { - if (lce->e_bdev == bdev && lce->e_block == block) - goto out; + if (lce->e_bdev == bdev && lce->e_block == block) { + 
hlist_bl_unlock(block_hash_p); + return -EBUSY; + } } mb_assert(!__mb_cache_entry_is_block_hashed(ce)); - __mb_cache_entry_unhash(ce); + __mb_cache_entry_unhash_block(ce); + __mb_cache_entry_unhash_index(ce); ce->e_bdev = bdev; ce->e_block = block; ce->e_block_hash_p = block_hash_p; ce->e_index.o_key = key; + hlist_bl_add_head(&ce->e_block_list, block_hash_p); + hlist_bl_unlock(block_hash_p); bucket = hash_long(key, cache->c_bucket_bits); index_hash_p = &cache->c_index_hash[bucket]; + hlist_bl_lock(index_hash_p); ce->e_index_hash_p = index_hash_p; hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); - hlist_bl_add_head(&ce->e_block_list, block_hash_p); - error = 0; -out: - spin_unlock(&mb_cache_spinlock); - return error; + hlist_bl_unlock(index_hash_p); + return 0; } @@ -456,24 +617,26 @@ out: void mb_cache_entry_release(struct mb_cache_entry *ce) { - spin_lock(&mb_cache_spinlock); - __mb_cache_entry_release_unlock(ce); + __mb_cache_entry_release(ce); } /* * mb_cache_entry_free() * - * This is equivalent to the sequence mb_cache_entry_takeout() -- - * mb_cache_entry_release(). */ void mb_cache_entry_free(struct mb_cache_entry *ce) { - spin_lock(&mb_cache_spinlock); + mb_assert(ce); mb_assert(list_empty(&ce->e_lru_list)); - __mb_cache_entry_unhash(ce); - __mb_cache_entry_release_unlock(ce); + hlist_bl_lock(ce->e_index_hash_p); + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_lock(ce->e_block_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); + __mb_cache_entry_release(ce); } @@ -497,39 +660,48 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); block_hash_p = &cache->c_block_hash[bucket]; - spin_lock(&mb_cache_spinlock); + /* First serialize access to the block corresponding hash chain. */ + hlist_bl_lock(block_hash_p); hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { mb_assert(ce->e_block_hash_p == block_hash_p); if (ce->e_bdev == bdev && ce->e_block == block) { - DEFINE_WAIT(wait); + /* + * Prevent a free from removing the entry. 
+ */ + atomic_inc(&ce->e_refcnt); + hlist_bl_unlock(block_hash_p); + __spin_lock_mb_cache_entry(ce); + atomic_dec(&ce->e_refcnt); + if (ce->e_used > 0) { + DEFINE_WAIT(wait); + while (ce->e_used > 0) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + __spin_unlock_mb_cache_entry(ce); + schedule(); + __spin_lock_mb_cache_entry(ce); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); + } + ce->e_used += 1 + MB_CACHE_WRITER; + __spin_unlock_mb_cache_entry(ce); - if (!list_empty(&ce->e_lru_list)) + if (!list_empty(&ce->e_lru_list)) { + spin_lock(&mb_cache_spinlock); list_del_init(&ce->e_lru_list); - - while (ce->e_used > 0) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); spin_unlock(&mb_cache_spinlock); - schedule(); - spin_lock(&mb_cache_spinlock); - ce->e_queued--; } - finish_wait(&mb_cache_queue, &wait); - ce->e_used += 1 + MB_CACHE_WRITER; - if (!__mb_cache_entry_is_block_hashed(ce)) { - __mb_cache_entry_release_unlock(ce); + __mb_cache_entry_release(ce); return NULL; } - goto cleanup; + return ce; } } - ce = NULL; - -cleanup: - spin_unlock(&mb_cache_spinlock); - return ce; + hlist_bl_unlock(block_hash_p); + return NULL; } #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) @@ -538,40 +710,53 @@ static struct mb_cache_entry * __mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, struct block_device *bdev, unsigned int key) { + + /* The index hash chain is already acquired by the caller. */ while (l != NULL) { struct mb_cache_entry *ce = hlist_bl_entry(l, struct mb_cache_entry, e_index.o_list); mb_assert(ce->e_index_hash_p == head); if (ce->e_bdev == bdev && ce->e_index.o_key == key) { - DEFINE_WAIT(wait); - - if (!list_empty(&ce->e_lru_list)) - list_del_init(&ce->e_lru_list); - + /* + * Prevent a free from removing the entry. + */ + atomic_inc(&ce->e_refcnt); + hlist_bl_unlock(head); + __spin_lock_mb_cache_entry(ce); + atomic_dec(&ce->e_refcnt); + ce->e_used++; /* Incrementing before holding the lock gives readers priority over writers.
*/ - ce->e_used++; - while (ce->e_used >= MB_CACHE_WRITER) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&mb_cache_spinlock); - schedule(); + if (ce->e_used >= MB_CACHE_WRITER) { + DEFINE_WAIT(wait); + + while (ce->e_used >= MB_CACHE_WRITER) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + __spin_unlock_mb_cache_entry(ce); + schedule(); + __spin_lock_mb_cache_entry(ce); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); + } + __spin_unlock_mb_cache_entry(ce); + if (!list_empty(&ce->e_lru_list)) { spin_lock(&mb_cache_spinlock); - ce->e_queued--; + list_del_init(&ce->e_lru_list); + spin_unlock(&mb_cache_spinlock); } - finish_wait(&mb_cache_queue, &wait); - if (!__mb_cache_entry_is_block_hashed(ce)) { - __mb_cache_entry_release_unlock(ce); - spin_lock(&mb_cache_spinlock); + __mb_cache_entry_release(ce); return ERR_PTR(-EAGAIN); } return ce; } l = l->next; } + hlist_bl_unlock(head); return NULL; } @@ -598,12 +783,12 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, struct hlist_bl_head *index_hash_p; index_hash_p = &cache->c_index_hash[bucket]; - spin_lock(&mb_cache_spinlock); + hlist_bl_lock(index_hash_p); if (!hlist_bl_empty(index_hash_p)) { l = hlist_bl_first(index_hash_p); ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); - } - spin_unlock(&mb_cache_spinlock); + } else + hlist_bl_unlock(index_hash_p); return ce; } @@ -638,11 +823,11 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, index_hash_p = &cache->c_index_hash[bucket]; mb_assert(prev->e_index_hash_p == index_hash_p); - spin_lock(&mb_cache_spinlock); + hlist_bl_lock(index_hash_p); mb_assert(!hlist_bl_empty(index_hash_p)); l = prev->e_index.o_list.next; ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); - __mb_cache_entry_release_unlock(prev); + __mb_cache_entry_release(prev); return ce; } -- cgit v0.10.2 From 9c191f701ce9f9bc604e88a5dc69cd943daa5d3b Mon Sep 17 00:00:00 2001 From: T Makphaibulchoke Date: Tue, 18 Mar 2014 19:24:49 -0400 Subject: ext4: each filesystem creates and uses its own mb_cache This patch adds new interfaces to create and destroy a cache, ext4_xattr_create_cache() and ext4_xattr_destroy_cache(), and removes the cache creation and destroy calls from ext4_init_xattr() and ext4_exit_xattr() in fs/ext4/xattr.c. fs/ext4/super.c has been changed so that when a filesystem is mounted a cache is allocated and attached to its ext4_sb_info structure. fs/mbcache.c has been changed so that only one slab allocator is allocated and used by all mbcache structures. Signed-off-by: T. Makphaibulchoke diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1b3cbf8..f4f889e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1329,6 +1329,7 @@ struct ext4_sb_info { struct list_head s_es_lru; unsigned long s_es_last_sorted; struct percpu_counter s_extent_cache_cnt; + struct mb_cache *s_mb_cache; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages.
*/ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 89baee4..5a51af7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -59,6 +59,7 @@ static struct kset *ext4_kset; static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; static struct ext4_features *ext4_feat; +static int ext4_mballoc_ready; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -845,6 +846,10 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); sb->s_fs_info = NULL; @@ -4010,6 +4015,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); no_journal: + if (ext4_mballoc_ready) { + sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; + } + } + /* * Get the # of file system overhead blocks from the * superblock if present. @@ -5519,11 +5532,9 @@ static int __init ext4_init_fs(void) err = ext4_init_mballoc(); if (err) - goto out3; - - err = ext4_init_xattr(); - if (err) goto out2; + else + ext4_mballoc_ready = 1; err = init_inodecache(); if (err) goto out1; @@ -5539,10 +5550,9 @@ out: unregister_as_ext3(); destroy_inodecache(); out1: - ext4_exit_xattr(); -out2: + ext4_mballoc_ready = 0; ext4_exit_mballoc(); -out3: +out2: ext4_exit_feat_adverts(); out4: if (ext4_proc_root) @@ -5565,7 +5575,6 @@ static void __exit ext4_exit_fs(void) unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); destroy_inodecache(); - ext4_exit_xattr(); ext4_exit_mballoc(); ext4_exit_feat_adverts(); remove_proc_entry("fs/ext4", NULL); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 185066f..1f5cf58 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -81,7 +81,7 @@ # define ea_bdebug(bh, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); @@ -90,8 +90,6 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *, static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size); -static struct mb_cache *ext4_xattr_cache; - static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = { NULL }; +#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_mb_cache) + static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) @@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -286,7 +288,7 @@ bad_block: error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); + ext4_xattr_cache_insert(ext4_mb_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); if (error == -EIO) @@ -409,6 +411,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -430,7 +433,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); + ext4_xattr_cache_insert(ext4_mb_cache, bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: @@ -526,8 +529,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, { struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); - ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); + ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); error = ext4_journal_get_write_access(handle, bh); if (error) goto out; @@ -746,13 +750,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_search *s = &bs->s; struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) if (i->value && i->value_len > sb->s_blocksize) return -ENOSPC; if (s->base) { - ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, bs->bh->b_blocknr); error = ext4_journal_get_write_access(handle, bs->bh); if (error) @@ -770,7 +775,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (!IS_LAST_ENTRY(s->first)) ext4_xattr_rehash(header(s->base), s->here); - ext4_xattr_cache_insert(bs->bh); + ext4_xattr_cache_insert(ext4_mb_cache, + bs->bh); } unlock_buffer(bs->bh); if (error == -EIO) @@ -906,7 +912,7 @@ getblk_failed: memcpy(new_bh->b_data, s->base, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext4_xattr_cache_insert(new_bh); + 
ext4_xattr_cache_insert(ext4_mb_cache, new_bh); error = ext4_handle_dirty_xattr_block(handle, inode, new_bh); if (error) @@ -1495,13 +1501,13 @@ ext4_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. */ static void -ext4_xattr_cache_insert(struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); + ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); if (!ce) { ea_bdebug(bh, "out of memory"); return; @@ -1573,12 +1579,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, + ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, hash); while (ce) { struct buffer_head *bh; @@ -1676,19 +1683,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #undef BLOCK_HASH_SHIFT -int __init -ext4_init_xattr(void) +#define HASH_BUCKET_BITS 10 + +struct mb_cache * +ext4_xattr_create_cache(char *name) { - ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); - if (!ext4_xattr_cache) - return -ENOMEM; - return 0; + return mb_cache_create(name, HASH_BUCKET_BITS); } -void -ext4_exit_xattr(void) +void ext4_xattr_destroy_cache(struct mb_cache *cache) { - if (ext4_xattr_cache) - mb_cache_destroy(ext4_xattr_cache); - ext4_xattr_cache = NULL; + if (cache) + mb_cache_destroy(cache); } + diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 819d639..29bedf5 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -110,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); -extern int __init ext4_init_xattr(void); -extern void ext4_exit_xattr(void); - extern const struct xattr_handler *ext4_xattr_handlers[]; extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, @@ -124,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); +extern struct mb_cache *ext4_xattr_create_cache(char *name); +extern void ext4_xattr_destroy_cache(struct mb_cache *); + #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/mbcache.c b/fs/mbcache.c index 786ecab..bf166e3 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -99,6 +99,7 @@ static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); static struct blockgroup_lock *mb_cache_bg_lock; +static struct kmem_cache *mb_cache_kmem_cache; MODULE_AUTHOR("Andreas Gruenbacher "); MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); @@ -351,11 +352,14 @@ mb_cache_create(const char *name, int bucket_bits) goto fail; for (n=0; n<bucket_count; n++) INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]); - cache->c_entry_cache = kmem_cache_create(name, - sizeof(struct mb_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - if (!cache->c_entry_cache) - goto fail2; + if (!mb_cache_kmem_cache) { + mb_cache_kmem_cache = kmem_cache_create(name, + sizeof(struct mb_cache_entry), 0,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!mb_cache_kmem_cache) + goto fail2; + } + cache->c_entry_cache = mb_cache_kmem_cache; /* * Set an upper limit on the number of cache entries so that the hash @@ -476,6 +480,10 @@ mb_cache_destroy(struct mb_cache *cache) atomic_read(&cache->c_entry_count)); } + if (list_empty(&mb_cache_list)) { + kmem_cache_destroy(mb_cache_kmem_cache); + mb_cache_kmem_cache = NULL; + } kfree(cache->c_index_hash); kfree(cache->c_block_hash); kfree(cache); -- cgit v0.10.2 From c4f65706056e9f0c2cf126b29c6920a179d91150 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 20 Mar 2014 00:32:57 -0400 Subject: ext4: kill i_version support for Hurd-castrated file systems The Hurd file system uses the inode field which is now used for i_version for its translator block. This means that ext2 file systems that are formatted for GNU Hurd can't be used to support NFSv4. Given that Hurd file systems don't support extents, and a huge number of modern file system features, this is no great loss. If we don't do this, the attempt to update the i_version field will stomp over the translator block field, which will cause file system corruption for Hurd file systems. This can be replicated via: mke2fs -t ext2 -o hurd /dev/vdc mount -t ext4 /dev/vdc /vdc touch /vdc/bug0000 umount /dev/vdc e2fsck -f /dev/vdc Addresses-Debian-Bug: #738758 Reported-By: Gabriele Giacone <1o5g4r8o@gmail.com> Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7cc2455..ed2c13a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4168,11 +4168,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); - inode->i_version = le32_to_cpu(raw_inode->i_disk_version); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - inode->i_version |= - (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_HURD)) { + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } } ret = 0; @@ -4388,12 +4391,16 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_block[block] = ei->i_data[block]; } - raw_inode->i_disk_version = cpu_to_le32(inode->i_version); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(inode->i_version >> 32); - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_HURD)) { + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } } ext4_inode_csum_set(inode, raw_inode, ei); -- cgit v0.10.2 From ed3654eb981fd44694b4d2a636e13f998bc10e7f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 24 Mar 2014 14:09:06 -0400 Subject: ext4: optimize Hurd tests when reading/writing inodes Set an in-memory superblock flag to indicate whether the file system is designed to support the Hurd.
Also, add a sanity check to make sure the 64-bit feature is not set for Hurd file systems, since i_file_acl_high conflicts with a Hurd-specific field. Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f4f889e..e01135d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1001,6 +1001,8 @@ struct ext4_inode_info { #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group size of blocksize * 8 blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ed2c13a..b5e182a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4168,8 +4168,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) { + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { inode->i_version = le32_to_cpu(raw_inode->i_disk_version); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) @@ -4345,8 +4344,7 @@ static int ext4_do_update_inode(handle_t *handle, goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); @@ -4391,8 +4389,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_block[block] = ei->i_data[block]; } - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) { + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { raw_inode->i_disk_version = cpu_to_le32(inode->i_version); if (ei->i_extra_isize) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5a51af7..f3c6670 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3580,6 +3580,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "feature flags set on rev 0 fs, " "running e2fsck is recommended"); + if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { + set_opt2(sb, HURD_COMPAT); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "The Hurd can't support 64-bit file systems"); + goto failed_mount; + } + } + if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " -- cgit v0.10.2 From 5f16f3225b06242a9ee876f07c1c9b6ed36a22b6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 24 Mar 2014 14:43:12 -0400 Subject: ext4: atomically set inode->i_flags in ext4_set_inode_flags() Use cmpxchg() to atomically set i_flags instead of clearing out the S_IMMUTABLE, S_APPEND, etc. flags and then setting them from the EXT4_IMMUTABLE_FL, EXT4_APPEND_FL flags, since this opens up a race where an immutable file has the immutable flag cleared for a brief window of time. 
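The pattern this fix introduces is a compare-and-swap retry loop: read the word, compute a new value with only the masked bits replaced, and retry if another CPU changed the word in the meantime. A small userspace sketch of the same pattern, using C11 atomics in place of the kernel's ACCESS_ONCE()/cmpxchg() and with illustrative flag values:

/*
 * Replace the bits selected by `mask` with `flags` without ever
 * writing back a stale value for the unrelated bits.
 */
#include <stdatomic.h>
#include <stdio.h>

static void set_flags_atomic(atomic_uint *word, unsigned int flags,
			     unsigned int mask)
{
	unsigned int old_flags, new_flags;

	old_flags = atomic_load(word);
	do {
		new_flags = (old_flags & ~mask) | flags;
		/* on failure, old_flags is reloaded with the current value */
	} while (!atomic_compare_exchange_weak(word, &old_flags, new_flags));
}

int main(void)
{
	atomic_uint i_flags = 0x10;	/* an unrelated bit, owned elsewhere */

	/* set bits 0 and 2, clear bits 1 and 3, leave all other bits alone */
	set_flags_atomic(&i_flags, 0x5, 0xf);
	printf("i_flags = %#x\n", atomic_load(&i_flags));	/* prints 0x15 */
	return 0;
}

Because the masked bits are swapped in a single atomic step, a flag such as S_IMMUTABLE is never observable as cleared partway through the update, which is exactly the window the old clear-then-set code opened.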
Reported-by: John Sullivan Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b5e182a..df067c3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3938,18 +3938,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) void ext4_set_inode_flags(struct inode *inode) { unsigned int flags = EXT4_I(inode)->i_flags; + unsigned int new_fl = 0; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); if (flags & EXT4_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ diff --git a/fs/inode.c b/fs/inode.c index 4bcdad3..26f95ce 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1899,3 +1899,34 @@ void inode_dio_done(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } EXPORT_SYMBOL(inode_dio_done); + +/* + * inode_set_flags - atomically set some inode flags + * + * Note: the caller should be holding i_mutex, or else be sure that + * they have exclusive access to the inode structure (i.e., while the + * inode is being instantiated). The reason for the cmpxchg() loop + * --- which wouldn't be necessary if all code paths which modify + * i_flags actually followed this rule, is that there is at least one + * code path which doesn't today --- for example, + * __generic_file_aio_write() calls file_remove_suid() without holding + * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * + * In the long run, i_mutex is overkill, and we should probably look + * at using the i_lock spinlock to protect i_flags, and then make sure + * it is so documented in include/linux/fs.h and that all code follows + * the locking convention!! + */ +void inode_set_flags(struct inode *inode, unsigned int flags, + unsigned int mask) +{ + unsigned int old_flags, new_flags; + + WARN_ON_ONCE(flags & ~mask); + do { + old_flags = ACCESS_ONCE(inode->i_flags); + new_flags = (old_flags & ~mask) | flags; + } while (unlikely(cmpxchg(&inode->i_flags, old_flags, + new_flags) != old_flags)); +} +EXPORT_SYMBOL(inode_set_flags); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6082956..5d1f6fa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2556,6 +2556,9 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, void inode_dio_wait(struct inode *inode); void inode_dio_done(struct inode *inode); +extern void inode_set_flags(struct inode *inode, unsigned int flags, + unsigned int mask); + extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -- cgit v0.10.2 From 94350ab5c34166f08ef67aaca3a01e6b420891c9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 24 Mar 2014 15:09:16 -0400 Subject: ext4: make ext4_block_zero_page_range static It's only called within inode.c, so make it static, remove its prototype from ext4.h and move it above all of its callers so it doesn't need a prototype within inode.c. 
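For reference, the tail-zeroing arithmetic that ext4_block_truncate_page() feeds into ext4_block_zero_page_range() is simply "from the given offset to the end of its block". A standalone sketch of the computation, with illustrative values:

#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;		/* must be a power of two */
	unsigned long long from = 10000;	/* truncation point in bytes */
	unsigned int offset = from & (blocksize - 1);	/* offset in block */
	unsigned int length = blocksize - offset;	/* bytes to zero */

	/* zeroing covers [from, from + length), i.e. up to the block end */
	printf("zero %u bytes starting at %llu (in-block offset %u)\n",
	       length, from, offset);
	return 0;
}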
Signed-off-by: Matthew Wilcox Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e01135d..f1c65dc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2139,8 +2139,6 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index df067c3..f03a9e7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3323,33 +3323,13 @@ void ext4_set_aops(struct inode *inode) } /* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block * that cooresponds to 'from' */ -int ext4_block_zero_page_range(handle_t *handle, +static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -3439,6 +3419,26 @@ unlock: return err; } +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); +} + int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { -- cgit v0.10.2 From e04027e887c37b670e30a3f29fde8bfbeba56abc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 24 Mar 2014 15:15:07 -0400 Subject: ext4: fix comment typo Signed-off-by: Matthew Wilcox Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f03a9e7..0171c19 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3698,7 +3698,7 @@ void ext4_truncate(struct inode *inode) /* * There is a possibility that we're either freeing the inode - * or it completely new indode. In those cases we might not + * or it's a completely new inode. 
In those cases we might not * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) -- cgit v0.10.2 From e5b30416f3631bf4eed37c3bec9f789f9ae78446 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 1 Apr 2014 00:59:21 -0400 Subject: ext4: remove unneeded test of ret variable Currently in ext4_fallocate() and ext4_zero_range() we're testing ret variable along with new_size. However in ext4_fallocate() we just tested ret before and in ext4_zero_range() if will always be zero when we get there so there is no need to test it in both cases. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 464e95d..c92ef87 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4816,12 +4816,12 @@ static long ext4_zero_range(struct file *file, loff_t offset, inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - if (!ret && new_size) { + if (new_size) { if (new_size > i_size_read(inode)) i_size_write(inode, new_size); if (new_size > EXT4_I(inode)->i_disksize) ext4_update_i_disksize(inode, new_size); - } else if (!ret && !new_size) { + } else { /* * Mark that we allocate beyond EOF so the subsequent truncate * can proceed even if the new size is the same as i_size. @@ -4923,14 +4923,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) tv = inode->i_ctime = ext4_current_time(inode); - if (!ret && new_size) { + if (new_size) { if (new_size > i_size_read(inode)) { i_size_write(inode, new_size); inode->i_mtime = tv; } if (new_size > EXT4_I(inode)->i_disksize) ext4_update_i_disksize(inode, new_size); - } else if (!ret && !new_size) { + } else { /* * Mark that we allocate beyond EOF so the subsequent truncate * can proceed even if the new size is the same as i_size. -- cgit v0.10.2 From ad6599ab3ac98a4474544086e048ce86ec15a4d1 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Tue, 1 Apr 2014 19:49:30 -0400 Subject: ext4: fix premature freeing of partial clusters split across leaf blocks Xfstests generic/311 and shared/298 fail when run on a bigalloc file system. Kernel error messages produced during the tests report that blocks to be freed are already on the to-be-freed list. When e2fsck is run at the end of the tests, it typically reports bad i_blocks and bad free blocks counts. The bug that causes these failures is located in ext4_ext_rm_leaf(). Code at the end of the function frees a partial cluster if it's not shared with an extent remaining in the leaf. However, if all the extents in the leaf have been removed, the code dereferences an invalid extent pointer (off the front of the leaf) when the check for sharing is made. This generally has the effect of unconditionally freeing the partial cluster, which leads to the observed failures when the partial cluster is shared with the last extent in the next leaf. Fix this by attempting to free the cluster only if extents remain in the leaf. Any remaining partial cluster will be freed if possible when the next leaf is processed or when leaf removal is complete. Signed-off-by: Eric Whitney Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c92ef87..82df3ce 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2743,10 +2743,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, err = ext4_ext_correct_indexes(handle, inode, path); /* - * Free the partial cluster only if the current extent does not - * reference it. Otherwise we might free used cluster. 
+ * If there's a partial cluster and at least one extent remains in + * the leaf, free the partial cluster if it isn't shared with the + * current extent. If there's a partial cluster and no extents + * remain in the leaf, it can't be freed here. It can only be + * freed when it's possible to determine if it's not shared with + * any other extent - when the next leaf is processed or when space + * removal is complete. */ - if (*partial_cluster > 0 && + if (*partial_cluster > 0 && eh->eh_entries && (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != *partial_cluster)) { int flags = get_default_free_blocks_flags(inode); -- cgit v0.10.2
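The shared-cluster test this fix guards is plain shift arithmetic: on a bigalloc file system, EXT4_B2C() maps a block number to its cluster number by shifting right by the cluster ratio, and a leftover partial cluster may only be freed when no remaining extent maps a block into the same cluster. A minimal userspace sketch with illustrative values:

#include <stdio.h>

/* Block number to cluster number, as EXT4_B2C() does with s_cluster_bits. */
static unsigned long long b2c(unsigned long long block,
			      unsigned int cluster_bits)
{
	return block >> cluster_bits;
}

int main(void)
{
	unsigned int cluster_bits = 4;		/* 16 blocks per cluster */
	unsigned long long partial = 0x1007;	/* block of the leftover partial cluster */
	unsigned long long ex_last = 0x1008;	/* last block of the last remaining extent */

	if (b2c(partial, cluster_bits) == b2c(ex_last, cluster_bits))
		printf("cluster %llu is shared: must not be freed yet\n",
		       b2c(partial, cluster_bits));
	else
		printf("cluster %llu is unshared: safe to free\n",
		       b2c(partial, cluster_bits));
	return 0;
}

When the leaf has no extents left there is no valid extent to compare against, which is why the patch also requires eh->eh_entries to be nonzero before attempting the free.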