From 8018ec083c72443cc74fd2d08eb7c5dddc13af53 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 9 Sep 2014 11:44:46 +1000
Subject: xfs: mark all internal workqueues as freezable

Workqueues must be explicitly set as freezable to ensure they are frozen
in the assocated part of the hibernation/suspend sequence. Freezing of
workqueues and kernel threads is important to ensure that modifications
are not made on-disk after the hibernation image has been created.
Otherwise, the in-memory state can become inconsistent with what is on
disk and eventually lead to filesystem corruption. We have reports of
free space btree corruptions that occur immediately after restore from
hibernate that suggest the xfs-eofblocks workqueue could be causing
such problems if it races with hibernation.

Mark all of the internal XFS workqueues as freezable to ensure nothing
changes on-disk once the freezer infrastructure freezes kernel threads
and creates the hibernation image.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reported-by: Carlos E. R. <carlos.e.r@opensuse.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index cd7b8ca..ec65050 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1884,7 +1884,7 @@ xfs_buf_init(void)
 		goto out;
 
 	xfslogd_workqueue = alloc_workqueue("xfslogd",
-					WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
+				WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1);
 	if (!xfslogd_workqueue)
 		goto out_free_buf_zone;
 
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 1eb6f3d..30ecca3 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -304,7 +304,8 @@ _xfs_mru_cache_reap(
 int
 xfs_mru_cache_init(void)
 {
-	xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
+	xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
+				WQ_MEM_RECLAIM|WQ_FREEZABLE, 1);
 	if (!xfs_mru_reap_wq)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b194652..bc9ec44 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -838,32 +838,32 @@ xfs_init_mount_workqueues(
 	struct xfs_mount	*mp)
 {
 	mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
-			WQ_MEM_RECLAIM, 0, mp->m_fsname);
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
 	if (!mp->m_data_workqueue)
 		goto out;
 
 	mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
-			WQ_MEM_RECLAIM, 0, mp->m_fsname);
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
 	if (!mp->m_unwritten_workqueue)
 		goto out_destroy_data_iodone_queue;
 
 	mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
-			WQ_MEM_RECLAIM, 0, mp->m_fsname);
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
 	if (!mp->m_cil_workqueue)
 		goto out_destroy_unwritten;
 
 	mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
-			0, 0, mp->m_fsname);
+			WQ_FREEZABLE, 0, mp->m_fsname);
 	if (!mp->m_reclaim_workqueue)
 		goto out_destroy_cil;
 
 	mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
-			0, 0, mp->m_fsname);
+			WQ_FREEZABLE, 0, mp->m_fsname);
 	if (!mp->m_log_workqueue)
 		goto out_destroy_reclaim;
 
 	mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
-			0, 0, mp->m_fsname);
+			WQ_FREEZABLE, 0, mp->m_fsname);
 	if (!mp->m_eofblocks_workqueue)
 		goto out_destroy_log;
 
@@ -1715,7 +1715,8 @@ xfs_init_workqueues(void)
 	 * AGs in all the filesystems mounted. Hence use the default large
 	 * max_active value for this workqueue.
 	 */
-	xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
+	xfs_alloc_wq = alloc_workqueue("xfsalloc",
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
 	if (!xfs_alloc_wq)
 		return -ENOMEM;
 
-- 
cgit v0.10.2


From e1b05723ed834090caab56866adc05bce31c9bdd Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 9 Sep 2014 11:47:24 +1000
Subject: xfs: add a few more verifier tests

These were exposed by fsfuzzer runs; without them we fail
in various exciting and sometimes convoluted ways when we
encounter disk corruption.

Without the MAXLEVELS tests we tend to walk off the end of
an array in a loop like this:

        for (i = 0; i < cur->bc_nlevels; i++) {
                if (cur->bc_bufs[i])

Without the dirblklog test we try to allocate more memory
than we could possibly hope for and loop forever:

xfs_dabuf_map()
	nfsb = mp->m_dir_geo->fsbcount;
	irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP...

As for the logbsize check, that's the convoluted one.

If logbsize is specified at mount time, it's sanitized
in xfs_parseargs; in particular it makes sure that it's
not > XLOG_MAX_RECORD_BSIZE.

If not specified at mount time, it comes from the superblock
via sb_logsunit; this is limited to 256k at mkfs time as well;
it's copied into m_logbsize in xfs_finish_flags().

However, if for some reason the on-disk value is corrupt and
too large, nothing catches it.  It's a circuitous path, but
that size eventually finds its way to places that make the kernel
very unhappy, leading to oopses in xlog_pack_data() because we
use the size as an index into iclog->ic_data, but the array
is not necessarily that big.

Anyway - bounds checking when we read from disk is a good thing!

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 4bffffe..eff3421 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2209,6 +2209,10 @@ xfs_agf_verify(
 	      be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
 		return false;
 
+	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
+	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
+		return false;
+
 	/*
 	 * during growfs operations, the perag is not fully initialised,
 	 * so we can't use it for any useful checking. growfs ensures we can't
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index b62771f..d213a2e 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2051,6 +2051,8 @@ xfs_agi_verify(
 	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
 		return false;
 
+	if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+		return false;
 	/*
 	 * during growfs operations, the perag is not fully initialised,
 	 * so we can't use it for any useful checking. growfs ensures we can't
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index ad525a5..8426e5e 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -279,11 +279,13 @@ xfs_mount_validate_sb(
 	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
 	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
 	    sbp->sb_blocksize != (1 << sbp->sb_blocklog)		||
+	    sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG			||
 	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
 	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
 	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG			||
 	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG			||
 	    sbp->sb_inodesize != (1 << sbp->sb_inodelog)		||
+	    sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE			||
 	    sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
 	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)	||
 	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
-- 
cgit v0.10.2


From 65b65735fede29b516fed1d8c2391e8bc373b805 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 9 Sep 2014 11:52:42 +1000
Subject: xfs: add debug sysfs attribute set

Create a top-level debug directory for global debug sysfs attributes.
This directory is added and removed on XFS module initialization and
removal respectively for DEBUG mode kernels only. It typically resides
at /sys/fs/xfs/debug. It is located at the top level of the xfs sysfs
hierarchy as attributes might define global behavior or behavior that
must be configured before an xfs mount is available (e.g., log recovery
behavior).

Define the global debug kobject that represents the debug sysfs
directory and add generic attribute show/store helpers to support future
attributes. No debug attributes are exported as of yet.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bc9ec44..dcd4b93 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -47,6 +47,7 @@
 #include "xfs_dinode.h"
 #include "xfs_filestream.h"
 #include "xfs_quota.h"
+#include "xfs_sysfs.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
@@ -61,7 +62,11 @@
 static const struct super_operations xfs_super_operations;
 static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
-struct kset *xfs_kset;
+
+struct kset *xfs_kset;			/* top-level xfs sysfs dir */
+#ifdef DEBUG
+static struct xfs_kobj xfs_dbg_kobj;	/* global debug sysfs attrs */
+#endif
 
 #define MNTOPT_LOGBUFS	"logbufs"	/* number of XFS log buffers */
 #define MNTOPT_LOGBSIZE	"logbsize"	/* size of XFS log buffers */
@@ -1769,9 +1774,16 @@ init_xfs_fs(void)
 		goto out_sysctl_unregister;;
 	}
 
-	error = xfs_qm_init();
+#ifdef DEBUG
+	xfs_dbg_kobj.kobject.kset = xfs_kset;
+	error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
 	if (error)
 		goto out_kset_unregister;
+#endif
+
+	error = xfs_qm_init();
+	if (error)
+		goto out_remove_kobj;
 
 	error = register_filesystem(&xfs_fs_type);
 	if (error)
@@ -1780,7 +1792,11 @@ init_xfs_fs(void)
 
  out_qm_exit:
 	xfs_qm_exit();
+ out_remove_kobj:
+#ifdef DEBUG
+	xfs_sysfs_del(&xfs_dbg_kobj);
  out_kset_unregister:
+#endif
 	kset_unregister(xfs_kset);
  out_sysctl_unregister:
 	xfs_sysctl_unregister();
@@ -1803,6 +1819,9 @@ exit_xfs_fs(void)
 {
 	xfs_qm_exit();
 	unregister_filesystem(&xfs_fs_type);
+#ifdef DEBUG
+	xfs_sysfs_del(&xfs_dbg_kobj);
+#endif
 	kset_unregister(xfs_kset);
 	xfs_sysctl_unregister();
 	xfs_cleanup_procfs();
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 9835139..32ddf0c 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -51,6 +51,49 @@ struct kobj_type xfs_mp_ktype = {
 	.release = xfs_sysfs_release,
 };
 
+#ifdef DEBUG
+/* debug */
+
+static struct attribute *xfs_dbg_attrs[] = {
+	NULL,
+};
+
+STATIC ssize_t
+xfs_dbg_show(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	char			*buf)
+{
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
+}
+
+STATIC ssize_t
+xfs_dbg_store(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	const char		*buf,
+	size_t			count)
+{
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
+}
+
+static struct sysfs_ops xfs_dbg_ops = {
+	.show = xfs_dbg_show,
+	.store = xfs_dbg_store,
+};
+
+struct kobj_type xfs_dbg_ktype = {
+	.release = xfs_sysfs_release,
+	.sysfs_ops = &xfs_dbg_ops,
+	.default_attrs = xfs_dbg_attrs,
+};
+
+#endif /* DEBUG */
+
 /* xlog */
 
 STATIC ssize_t
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 54a2091..240eee3 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -20,6 +20,7 @@
 #define __XFS_SYSFS_H__
 
 extern struct kobj_type xfs_mp_ktype;	/* xfs_mount */
+extern struct kobj_type xfs_dbg_ktype;	/* debug */
 extern struct kobj_type xfs_log_ktype;	/* xlog */
 
 static inline struct xfs_kobj *
-- 
cgit v0.10.2


From 2e2271787419a12496bf5da5c3028a9c73c9697f Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 9 Sep 2014 11:56:13 +1000
Subject: xfs: export log_recovery_delay to delay mount time log recovery

XFS log recovery has been discovered to have race conditions with
buffers when I/O errors occur. External tools are available to simulate
I/O errors to XFS, but this alone is not sufficient for testing log
recovery. XFS unconditionally resets the inactive region of the log
prior to log recovery to avoid confusion over processing any partially
written log records that might have been written before an unclean
shutdown. Therefore, unconditional write I/O failures at mount time are
caught by the reset sequence rather than log recovery and hinder the
ability to test the latter.

The device-mapper dm-flakey module uses an up/down timer to define a
cycle for when to fail I/Os. Create a pre log recovery delay tunable
that can be used to coordinate XFS log recovery with I/O errors
simulated by dm-flakey. This facilitates coordination in userspace that
allows the reset of stale log blocks to succeed and writes due to log
recovery to fail. For example, define a dm-flakey instance with an
uptime long enough to allow log reset to succeed and a log recovery
delay long enough to allow the dm-flakey uptime to expire.

The 'log_recovery_delay' sysfs tunable is exported under
/sys/fs/xfs/debug and is only enabled for kernels compiled in XFS debug
mode. The value is exported in units of seconds and allows for a delay
of up to 60 seconds. Note that this is for XFS debug and test
instrumentation purposes only and should not be used by applications. No
delay is enabled by default.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5399ef2..4d41b24 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -43,3 +43,7 @@ xfs_param_t xfs_params = {
 	.fstrm_timer	= {	1,		30*100,		3600*100},
 	.eofb_timer	= {	1,		300,		3600*24},
 };
+
+struct xfs_globals xfs_globals = {
+	.log_recovery_delay	=	0,	/* no delay by default */
+};
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1fd5787..176c4b3 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4509,6 +4509,18 @@ xlog_recover(
 			return -EINVAL;
 		}
 
+		/*
+		 * Delay log recovery if the debug hook is set. This is debug
+		 * instrumention to coordinate simulation of I/O failures with
+		 * log recovery.
+		 */
+		if (xfs_globals.log_recovery_delay) {
+			xfs_notice(log->l_mp,
+				"Delaying log recovery for %d seconds.",
+				xfs_globals.log_recovery_delay);
+			msleep(xfs_globals.log_recovery_delay * 1000);
+		}
+
 		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
 				log->l_mp->m_logname ? log->l_mp->m_logname
 						     : "internal");
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index bd8e157..ffef453 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -92,6 +92,11 @@ enum {
 
 extern xfs_param_t	xfs_params;
 
+struct xfs_globals {
+	int	log_recovery_delay;	/* log recovery delay (secs) */
+};
+extern struct xfs_globals	xfs_globals;
+
 #ifdef CONFIG_SYSCTL
 extern int xfs_sysctl_register(void);
 extern void xfs_sysctl_unregister(void);
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 32ddf0c..aa03670 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -54,7 +54,38 @@ struct kobj_type xfs_mp_ktype = {
 #ifdef DEBUG
 /* debug */
 
+STATIC ssize_t
+log_recovery_delay_store(
+	const char	*buf,
+	size_t		count,
+	void		*data)
+{
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val < 0 || val > 60)
+		return -EINVAL;
+
+	xfs_globals.log_recovery_delay = val;
+
+	return count;
+}
+
+STATIC ssize_t
+log_recovery_delay_show(
+	char	*buf,
+	void	*data)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
+}
+XFS_SYSFS_ATTR_RW(log_recovery_delay);
+
 static struct attribute *xfs_dbg_attrs[] = {
+	ATTR_LIST(log_recovery_delay),
 	NULL,
 };
 
-- 
cgit v0.10.2


From 49c69591c80648c14ff87525e97ee6ebe3a343cb Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 9 Sep 2014 11:56:48 +1000
Subject: xfs: combine xfs_seek_hole & xfs_seek_data

xfs_seek_hole & xfs_seek_data are remarkably similar;
so much so that they can be combined, saving a fair
bit of semi-complex code duplication.

The following patch passes generic/285 and generic/286,
which specifically test seek behavior.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 076b170..1da3b7d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -964,7 +964,7 @@ xfs_vm_page_mkwrite(
 
 /*
  * This type is designed to indicate the type of offset we would like
- * to search from page cache for either xfs_seek_data() or xfs_seek_hole().
+ * to search from page cache for xfs_seek_hole_data().
  */
 enum {
 	HOLE_OFF = 0,
@@ -1021,7 +1021,7 @@ xfs_lookup_buffer_offset(
 /*
  * This routine is called to find out and return a data or hole offset
  * from the page cache for unwritten extents according to the desired
- * type for xfs_seek_data() or xfs_seek_hole().
+ * type for xfs_seek_hole_data().
  *
  * The argument offset is used to tell where we start to search from the
  * page cache.  Map is used to figure out the end points of the range to
@@ -1181,9 +1181,10 @@ out:
 }
 
 STATIC loff_t
-xfs_seek_data(
+xfs_seek_hole_data(
 	struct file		*file,
-	loff_t			start)
+	loff_t			start,
+	int			whence)
 {
 	struct inode		*inode = file->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
@@ -1195,6 +1196,9 @@ xfs_seek_data(
 	uint			lock;
 	int			error;
 
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
 	lock = xfs_ilock_data_map_shared(ip);
 
 	isize = i_size_read(inode);
@@ -1209,6 +1213,7 @@ xfs_seek_data(
 	 */
 	fsbno = XFS_B_TO_FSBT(mp, start);
 	end = XFS_B_TO_FSB(mp, isize);
+
 	for (;;) {
 		struct xfs_bmbt_irec	map[2];
 		int			nmap = 2;
@@ -1229,29 +1234,48 @@ xfs_seek_data(
 			offset = max_t(loff_t, start,
 				       XFS_FSB_TO_B(mp, map[i].br_startoff));
 
-			/* Landed in a data extent */
-			if (map[i].br_startblock == DELAYSTARTBLOCK ||
-			    (map[i].br_state == XFS_EXT_NORM &&
-			     !isnullstartblock(map[i].br_startblock)))
+			/* Landed in the hole we wanted? */
+			if (whence == SEEK_HOLE &&
+			    map[i].br_startblock == HOLESTARTBLOCK)
+				goto out;
+
+			/* Landed in the data extent we wanted? */
+			if (whence == SEEK_DATA &&
+			    (map[i].br_startblock == DELAYSTARTBLOCK ||
+			     (map[i].br_state == XFS_EXT_NORM &&
+			      !isnullstartblock(map[i].br_startblock))))
 				goto out;
 
 			/*
-			 * Landed in an unwritten extent, try to search data
-			 * from page cache.
+			 * Landed in an unwritten extent, try to search
+			 * for hole or data from page cache.
 			 */
 			if (map[i].br_state == XFS_EXT_UNWRITTEN) {
 				if (xfs_find_get_desired_pgoff(inode, &map[i],
-							DATA_OFF, &offset))
+				      whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF,
+							&offset))
 					goto out;
 			}
 		}
 
 		/*
-		 * map[0] is hole or its an unwritten extent but
-		 * without data in page cache.  Probably means that
-		 * we are reading after EOF if nothing in map[1].
+		 * We only received one extent out of the two requested. This
+		 * means we've hit EOF and didn't find what we are looking for.
 		 */
 		if (nmap == 1) {
+			/*
+			 * If we were looking for a hole, set offset to
+			 * the end of the file (i.e., there is an implicit
+			 * hole at the end of any file).
+		 	 */
+			if (whence == SEEK_HOLE) {
+				offset = isize;
+				break;
+			}
+			/*
+			 * If we were looking for data, it's nowhere to be found
+			 */
+			ASSERT(whence == SEEK_DATA);
 			error = -ENXIO;
 			goto out_unlock;
 		}
@@ -1260,125 +1284,30 @@ xfs_seek_data(
 
 		/*
 		 * Nothing was found, proceed to the next round of search
-		 * if reading offset not beyond or hit EOF.
+		 * if the next reading offset is not at or beyond EOF.
 		 */
 		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
 		start = XFS_FSB_TO_B(mp, fsbno);
 		if (start >= isize) {
+			if (whence == SEEK_HOLE) {
+				offset = isize;
+				break;
+			}
+			ASSERT(whence == SEEK_DATA);
 			error = -ENXIO;
 			goto out_unlock;
 		}
 	}
 
 out:
-	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-
-out_unlock:
-	xfs_iunlock(ip, lock);
-
-	if (error)
-		return error;
-	return offset;
-}
-
-STATIC loff_t
-xfs_seek_hole(
-	struct file		*file,
-	loff_t			start)
-{
-	struct inode		*inode = file->f_mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	loff_t			uninitialized_var(offset);
-	xfs_fsize_t		isize;
-	xfs_fileoff_t		fsbno;
-	xfs_filblks_t		end;
-	uint			lock;
-	int			error;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-	lock = xfs_ilock_data_map_shared(ip);
-
-	isize = i_size_read(inode);
-	if (start >= isize) {
-		error = -ENXIO;
-		goto out_unlock;
-	}
-
-	fsbno = XFS_B_TO_FSBT(mp, start);
-	end = XFS_B_TO_FSB(mp, isize);
-
-	for (;;) {
-		struct xfs_bmbt_irec	map[2];
-		int			nmap = 2;
-		unsigned int		i;
-
-		error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
-				       XFS_BMAPI_ENTIRE);
-		if (error)
-			goto out_unlock;
-
-		/* No extents at given offset, must be beyond EOF */
-		if (nmap == 0) {
-			error = -ENXIO;
-			goto out_unlock;
-		}
-
-		for (i = 0; i < nmap; i++) {
-			offset = max_t(loff_t, start,
-				       XFS_FSB_TO_B(mp, map[i].br_startoff));
-
-			/* Landed in a hole */
-			if (map[i].br_startblock == HOLESTARTBLOCK)
-				goto out;
-
-			/*
-			 * Landed in an unwritten extent, try to search hole
-			 * from page cache.
-			 */
-			if (map[i].br_state == XFS_EXT_UNWRITTEN) {
-				if (xfs_find_get_desired_pgoff(inode, &map[i],
-							HOLE_OFF, &offset))
-					goto out;
-			}
-		}
-
-		/*
-		 * map[0] contains data or its unwritten but contains
-		 * data in page cache, probably means that we are
-		 * reading after EOF.  We should fix offset to point
-		 * to the end of the file(i.e., there is an implicit
-		 * hole at the end of any file).
-		 */
-		if (nmap == 1) {
-			offset = isize;
-			break;
-		}
-
-		ASSERT(i > 1);
-
-		/*
-		 * Both mappings contains data, proceed to the next round of
-		 * search if the current reading offset not beyond or hit EOF.
-		 */
-		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
-		start = XFS_FSB_TO_B(mp, fsbno);
-		if (start >= isize) {
-			offset = isize;
-			break;
-		}
-	}
-
-out:
 	/*
-	 * At this point, we must have found a hole.  However, the returned
+	 * If at this point we have found the hole we wanted, the returned
 	 * offset may be bigger than the file size as it may be aligned to
-	 * page boundary for unwritten extents, we need to deal with this
+	 * page boundary for unwritten extents.  We need to deal with this
 	 * situation in particular.
 	 */
-	offset = min_t(loff_t, offset, isize);
+	if (whence == SEEK_HOLE)
+		offset = min_t(loff_t, offset, isize);
 	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out_unlock:
@@ -1400,10 +1329,9 @@ xfs_file_llseek(
 	case SEEK_CUR:
 	case SEEK_SET:
 		return generic_file_llseek(file, offset, origin);
-	case SEEK_DATA:
-		return xfs_seek_data(file, offset);
 	case SEEK_HOLE:
-		return xfs_seek_hole(file, offset);
+	case SEEK_DATA:
+		return xfs_seek_hole_data(file, offset, origin);
 	default:
 		return -EINVAL;
 	}
-- 
cgit v0.10.2


From 59f9c004320704179913fa7c57645017ccf1b5c3 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 9 Sep 2014 11:57:10 +1000
Subject: xfs: lseek: the "whence" argument is called "whence"

For some reason, the older commit:

    965c8e5 lseek: the "whence" argument is called "whence"

    lseek: the "whence" argument is called "whence"

    But the kernel decided to call it "origin" instead.
    Fix most of the sites.

left out xfs.  So fix xfs.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1da3b7d..0fe36e4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1322,16 +1322,16 @@ STATIC loff_t
 xfs_file_llseek(
 	struct file	*file,
 	loff_t		offset,
-	int		origin)
+	int		whence)
 {
-	switch (origin) {
+	switch (whence) {
 	case SEEK_END:
 	case SEEK_CUR:
 	case SEEK_SET:
-		return generic_file_llseek(file, offset, origin);
+		return generic_file_llseek(file, offset, whence);
 	case SEEK_HOLE:
 	case SEEK_DATA:
-		return xfs_seek_hole_data(file, offset, origin);
+		return xfs_seek_hole_data(file, offset, whence);
 	default:
 		return -EINVAL;
 	}
-- 
cgit v0.10.2


From 970fd3f04d5949a4b5f6d0a5fea8e4b6797a5992 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 9 Sep 2014 11:57:29 +1000
Subject: xfs: deduplicate xlog_do_recovery_pass()

In xlog_do_recovery_pass(), there are 2 distinct cases:
non-wrapped and wrapped log recovery.

If we find a wrapped log, we recover around the end
of the log, and then handle the rest of recovery
exactly as in the non-wrapped case - using exactly the same
(duplicated) code.

Rather than having the same code in both cases, we can
get the wrapped portion out of the way first if needed,
and then recover the non-wrapped portion of the log.

There should be no functional change here, just code
reorganization & deduplication.

The patch looks a bit bigger than it really is; the last
hunk is whitespace changes (un-indenting).

Tested with xfstests "check -g log" on a stock configuration.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 176c4b3..29e101f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4132,41 +4132,13 @@ xlog_do_recovery_pass(
 	}
 
 	memset(rhash, 0, sizeof(rhash));
-	if (tail_blk <= head_blk) {
-		for (blk_no = tail_blk; blk_no < head_blk; ) {
-			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
-			if (error)
-				goto bread_err2;
-
-			rhead = (xlog_rec_header_t *)offset;
-			error = xlog_valid_rec_header(log, rhead, blk_no);
-			if (error)
-				goto bread_err2;
-
-			/* blocks in data section */
-			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
-			error = xlog_bread(log, blk_no + hblks, bblks, dbp,
-					   &offset);
-			if (error)
-				goto bread_err2;
-
-			error = xlog_unpack_data(rhead, offset, log);
-			if (error)
-				goto bread_err2;
-
-			error = xlog_recover_process_data(log,
-						rhash, rhead, offset, pass);
-			if (error)
-				goto bread_err2;
-			blk_no += bblks + hblks;
-		}
-	} else {
+	blk_no = tail_blk;
+	if (tail_blk > head_blk) {
 		/*
 		 * Perform recovery around the end of the physical log.
 		 * When the head is not on the same cycle number as the tail,
-		 * we can't do a sequential recovery as above.
+		 * we can't do a sequential recovery.
 		 */
-		blk_no = tail_blk;
 		while (blk_no < log->l_logBBsize) {
 			/*
 			 * Check for header wrapping around physical end-of-log
@@ -4280,34 +4252,35 @@ xlog_do_recovery_pass(
 
 		ASSERT(blk_no >= log->l_logBBsize);
 		blk_no -= log->l_logBBsize;
+	}
 
-		/* read first part of physical log */
-		while (blk_no < head_blk) {
-			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
-			if (error)
-				goto bread_err2;
+	/* read first part of physical log */
+	while (blk_no < head_blk) {
+		error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+		if (error)
+			goto bread_err2;
 
-			rhead = (xlog_rec_header_t *)offset;
-			error = xlog_valid_rec_header(log, rhead, blk_no);
-			if (error)
-				goto bread_err2;
+		rhead = (xlog_rec_header_t *)offset;
+		error = xlog_valid_rec_header(log, rhead, blk_no);
+		if (error)
+			goto bread_err2;
 
-			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
-			error = xlog_bread(log, blk_no+hblks, bblks, dbp,
-					   &offset);
-			if (error)
-				goto bread_err2;
+		/* blocks in data section */
+		bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+		error = xlog_bread(log, blk_no+hblks, bblks, dbp,
+				   &offset);
+		if (error)
+			goto bread_err2;
 
-			error = xlog_unpack_data(rhead, offset, log);
-			if (error)
-				goto bread_err2;
+		error = xlog_unpack_data(rhead, offset, log);
+		if (error)
+			goto bread_err2;
 
-			error = xlog_recover_process_data(log, rhash,
-							rhead, offset, pass);
-			if (error)
-				goto bread_err2;
-			blk_no += bblks + hblks;
-		}
+		error = xlog_recover_process_data(log, rhash,
+						rhead, offset, pass);
+		if (error)
+			goto bread_err2;
+		blk_no += bblks + hblks;
 	}
 
  bread_err2:
-- 
cgit v0.10.2


From 94f3cad555d66048906deade06a764f7ea2c6e4d Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 9 Sep 2014 11:57:52 +1000
Subject: xfs: check resblks before calling xfs_dir_canenter

Move the resblks test out of the xfs_dir_canenter,
and into the caller.

This makes a little more sense on the face of it;
xfs_dir_canenter immediately returns if resblks !=0;
and given some of the comments preceding the calls:

 * Check for ability to enter directory entry, if no space reserved.

even more so.

It also facilitates the next patch.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 6cef221..ea84e1c 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -535,22 +535,17 @@ out_free:
 
 /*
  * See if this entry can be added to the directory without allocating space.
- * First checks that the caller couldn't reserve enough space (resblks = 0).
  */
 int
 xfs_dir_canenter(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*dp,
-	struct xfs_name	*name,		/* name of entry to add */
-	uint		resblks)
+	struct xfs_name	*name)		/* name of entry to add */
 {
 	struct xfs_da_args *args;
 	int		rval;
 	int		v;		/* type-checking value */
 
-	if (resblks)
-		return 0;
-
 	ASSERT(S_ISDIR(dp->i_d.di_mode));
 
 	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c8e86b0..4dff261 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -136,7 +136,7 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
 				xfs_fsblock_t *first,
 				struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
-				struct xfs_name *name, uint resblks);
+				struct xfs_name *name);
 
 /*
  * Direct call from the bmap code, bypassing the generic directory layer.
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index fea3c92..c92cb48 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1153,9 +1153,11 @@ xfs_create(
 	if (error)
 		goto out_trans_cancel;
 
-	error = xfs_dir_canenter(tp, dp, name, resblks);
-	if (error)
-		goto out_trans_cancel;
+	if (!resblks) {
+		error = xfs_dir_canenter(tp, dp, name);
+		if (error)
+			goto out_trans_cancel;
+	}
 
 	/*
 	 * A newly created regular or special file just has one directory
@@ -1421,9 +1423,11 @@ xfs_link(
 		goto error_return;
 	}
 
-	error = xfs_dir_canenter(tp, tdp, target_name, resblks);
-	if (error)
-		goto error_return;
+	if (!resblks) {
+		error = xfs_dir_canenter(tp, tdp, target_name);
+		if (error)
+			goto error_return;
+	}
 
 	xfs_bmap_init(&free_list, &first_block);
 
@@ -2759,9 +2763,11 @@ xfs_rename(
 		 * If there's no space reservation, check the entry will
 		 * fit before actually inserting it.
 		 */
-		error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
-		if (error)
-			goto error_return;
+		if (!spaceres) {
+			error = xfs_dir_canenter(tp, target_dp, target_name);
+			if (error)
+				goto error_return;
+		}
 		/*
 		 * If target does not exist and the rename crosses
 		 * directories, adjust the target directory link count
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 6a944a2..02ae62a 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -269,9 +269,11 @@ xfs_symlink(
 	/*
 	 * Check for ability to enter directory entry, if no space reserved.
 	 */
-	error = xfs_dir_canenter(tp, dp, link_name, resblks);
-	if (error)
-		goto error_return;
+	if (!resblks) {
+		error = xfs_dir_canenter(tp, dp, link_name);
+		if (error)
+			goto error_return;
+	}
 	/*
 	 * Initialize the bmap freelist prior to calling either
 	 * bmapi or the directory create code.
-- 
cgit v0.10.2


From b16ed7c114b8cca45fa87b675c431f43ff90c179 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 9 Sep 2014 11:58:07 +1000
Subject: xfs: combine xfs_dir_canenter into xfs_dir_createname

xfs_dir_canenter and xfs_dir_createname are
almost identical.

Fold the former into the latter, with a helpful
wrapper for the former.  If createname is called without
an inode number, it now only checks for space, and does
not actually add the entry.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index ea84e1c..7075aaf 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -237,7 +237,8 @@ xfs_dir_init(
 }
 
 /*
-  Enter a name in a directory.
+ * Enter a name in a directory, or check for available space.
+ * If inum is 0, only the available space test is performed.
  */
 int
 xfs_dir_createname(
@@ -254,10 +255,12 @@ xfs_dir_createname(
 	int			v;		/* type-checking value */
 
 	ASSERT(S_ISDIR(dp->i_d.di_mode));
-	rval = xfs_dir_ino_validate(tp->t_mountp, inum);
-	if (rval)
-		return rval;
-	XFS_STATS_INC(xs_dir_create);
+	if (inum) {
+		rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+		if (rval)
+			return rval;
+		XFS_STATS_INC(xs_dir_create);
+	}
 
 	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
 	if (!args)
@@ -276,6 +279,8 @@ xfs_dir_createname(
 	args->whichfork = XFS_DATA_FORK;
 	args->trans = tp;
 	args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+	if (!inum)
+		args->op_flags |= XFS_DA_OP_JUSTCHECK;
 
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
 		rval = xfs_dir2_sf_addname(args);
@@ -542,50 +547,7 @@ xfs_dir_canenter(
 	xfs_inode_t	*dp,
 	struct xfs_name	*name)		/* name of entry to add */
 {
-	struct xfs_da_args *args;
-	int		rval;
-	int		v;		/* type-checking value */
-
-	ASSERT(S_ISDIR(dp->i_d.di_mode));
-
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-	if (!args)
-		return -ENOMEM;
-
-	args->geo = dp->i_mount->m_dir_geo;
-	args->name = name->name;
-	args->namelen = name->len;
-	args->filetype = name->type;
-	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-	args->dp = dp;
-	args->whichfork = XFS_DATA_FORK;
-	args->trans = tp;
-	args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
-							XFS_DA_OP_OKNOENT;
-
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		rval = xfs_dir2_sf_addname(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isblock(args, &v);
-	if (rval)
-		goto out_free;
-	if (v) {
-		rval = xfs_dir2_block_addname(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isleaf(args, &v);
-	if (rval)
-		goto out_free;
-	if (v)
-		rval = xfs_dir2_leaf_addname(args);
-	else
-		rval = xfs_dir2_node_addname(args);
-out_free:
-	kmem_free(args);
-	return rval;
+	return xfs_dir_createname(tp, dp, name, 0, NULL, NULL, 0);
 }
 
 /*
-- 
cgit v0.10.2


From afabfd30d05264ff493c24bce310b6a5350f099b Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 9 Sep 2014 11:58:42 +1000
Subject: xfs: combine xfs_rtmodify_summary and xfs_rtget_summary

xfs_rtmodify_summary and xfs_rtget_summary are almost identical;
fold them into xfs_rtmodify_summary_int(), with wrappers for each of
the original calls.

The _int function modifies if a delta is passed, and returns a
summary pointer if *sum is passed.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index f4dd697..50e3b93 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -424,20 +424,24 @@ xfs_rtfind_forw(
 }
 
 /*
- * Read and modify the summary information for a given extent size,
+ * Read and/or modify the summary information for a given extent size,
  * bitmap block combination.
  * Keeps track of a current summary block, so we don't keep reading
  * it from the buffer cache.
+ *
+ * Summary information is returned in *sum if specified.
+ * If no delta is specified, returns summary only.
  */
 int
-xfs_rtmodify_summary(
-	xfs_mount_t	*mp,		/* file system mount point */
+xfs_rtmodify_summary_int(
+	xfs_mount_t	*mp,		/* file system mount structure */
 	xfs_trans_t	*tp,		/* transaction pointer */
 	int		log,		/* log2 of extent size */
 	xfs_rtblock_t	bbno,		/* bitmap block number */
 	int		delta,		/* change to make to summary info */
 	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
-	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_suminfo_t	*sum)		/* out: summary info for this block */
 {
 	xfs_buf_t	*bp;		/* buffer for the summary block */
 	int		error;		/* error value */
@@ -480,15 +484,40 @@ xfs_rtmodify_summary(
 		}
 	}
 	/*
-	 * Point to the summary information, modify and log it.
+	 * Point to the summary information, modify/log it, and/or copy it out.
 	 */
 	sp = XFS_SUMPTR(mp, bp, so);
-	*sp += delta;
-	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
-		(uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
+	if (delta) {
+		uint first = (uint)((char *)sp - (char *)bp->b_addr);
+
+		*sp += delta;
+		xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
+	}
+	if (sum) {
+		/*
+		 * Drop the buffer if we're not asked to remember it.
+		 */
+		if (!rbpp)
+			xfs_trans_brelse(tp, bp);
+		*sum = *sp;
+	}
 	return 0;
 }
 
+int
+xfs_rtmodify_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	int		delta,		/* change to make to summary info */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	return xfs_rtmodify_summary_int(mp, tp, log, bbno,
+					delta, rbpp, rsb, NULL);
+}
+
 /*
  * Set the given range of bitmap bits to the given value.
  * Do whatever I/O and logging is required.
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 909e143..d1160cc 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -46,7 +46,7 @@
  * Keeps track of a current summary block, so we don't keep reading
  * it from the buffer cache.
  */
-STATIC int				/* error */
+int
 xfs_rtget_summary(
 	xfs_mount_t	*mp,		/* file system mount structure */
 	xfs_trans_t	*tp,		/* transaction pointer */
@@ -56,60 +56,9 @@ xfs_rtget_summary(
 	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
 	xfs_suminfo_t	*sum)		/* out: summary info for this block */
 {
-	xfs_buf_t	*bp;		/* buffer for summary block */
-	int		error;		/* error value */
-	xfs_fsblock_t	sb;		/* summary fsblock */
-	int		so;		/* index into the summary file */
-	xfs_suminfo_t	*sp;		/* pointer to returned data */
-
-	/*
-	 * Compute entry number in the summary file.
-	 */
-	so = XFS_SUMOFFS(mp, log, bbno);
-	/*
-	 * Compute the block number in the summary file.
-	 */
-	sb = XFS_SUMOFFSTOBLOCK(mp, so);
-	/*
-	 * If we have an old buffer, and the block number matches, use that.
-	 */
-	if (rbpp && *rbpp && *rsb == sb)
-		bp = *rbpp;
-	/*
-	 * Otherwise we have to get the buffer.
-	 */
-	else {
-		/*
-		 * If there was an old one, get rid of it first.
-		 */
-		if (rbpp && *rbpp)
-			xfs_trans_brelse(tp, *rbpp);
-		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
-		if (error) {
-			return error;
-		}
-		/*
-		 * Remember this buffer and block for the next call.
-		 */
-		if (rbpp) {
-			*rbpp = bp;
-			*rsb = sb;
-		}
-	}
-	/*
-	 * Point to the summary information & copy it out.
-	 */
-	sp = XFS_SUMPTR(mp, bp, so);
-	*sum = *sp;
-	/*
-	 * Drop the buffer if we're not asked to remember it.
-	 */
-	if (!rbpp)
-		xfs_trans_brelse(tp, bp);
-	return 0;
+	return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
 }
 
-
 /*
  * Return whether there are any free extents in the size range given
  * by low and high, for the bitmap block bbno.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index c642795..76c0a4a 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -111,6 +111,10 @@ int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
 		    xfs_rtblock_t *rtblock);
 int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
 		       xfs_rtblock_t start, xfs_extlen_t len, int val);
+int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
+			     int log, xfs_rtblock_t bbno, int delta,
+			     xfs_buf_t **rbpp, xfs_fsblock_t *rsb,
+			     xfs_suminfo_t *sum);
 int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
 			 xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp,
 			 xfs_fsblock_t *rsb);
-- 
cgit v0.10.2


From ab6978c295b074eb2ba4b06fdf206c7ab4f293e5 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 9 Sep 2014 11:59:12 +1000
Subject: xfs: remove rbpp check from xfs_rtmodify_summary_int

rbpp is always passed into xfs_rtmodify_summary
and xfs_rtget_summary, so there is no need to
test for it in xfs_rtmodify_summary_int.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 50e3b93..7c818f1 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -460,7 +460,7 @@ xfs_rtmodify_summary_int(
 	/*
 	 * If we have an old buffer, and the block number matches, use that.
 	 */
-	if (rbpp && *rbpp && *rsb == sb)
+	if (*rbpp && *rsb == sb)
 		bp = *rbpp;
 	/*
 	 * Otherwise we have to get the buffer.
@@ -469,7 +469,7 @@ xfs_rtmodify_summary_int(
 		/*
 		 * If there was an old one, get rid of it first.
 		 */
-		if (rbpp && *rbpp)
+		if (*rbpp)
 			xfs_trans_brelse(tp, *rbpp);
 		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
 		if (error) {
@@ -478,10 +478,8 @@ xfs_rtmodify_summary_int(
 		/*
 		 * Remember this buffer and block for the next call.
 		 */
-		if (rbpp) {
-			*rbpp = bp;
-			*rsb = sb;
-		}
+		*rbpp = bp;
+		*rsb = sb;
 	}
 	/*
 	 * Point to the summary information, modify/log it, and/or copy it out.
@@ -493,14 +491,8 @@ xfs_rtmodify_summary_int(
 		*sp += delta;
 		xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
 	}
-	if (sum) {
-		/*
-		 * Drop the buffer if we're not asked to remember it.
-		 */
-		if (!rbpp)
-			xfs_trans_brelse(tp, bp);
+	if (sum)
 		*sum = *sp;
-	}
 	return 0;
 }
 
-- 
cgit v0.10.2


From 0d085a529b427d97710e6a41f8a4f23e1757cd12 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 23 Sep 2014 15:36:27 +1000
Subject: xfs: ensure WB_SYNC_ALL writeback handles partial pages correctly

XFS has been having trouble with stray delayed allocation extents
beyond EOF for a long time. Recent changes to the collapse range
code has triggered erroneous EBUSY errors on page invalidtion for
block size smaller than page size filesystems. These
have been caused by dirty buffers beyond EOF on a partial page which
do not get written to disk during a sync.

The issue is that write-ahead in xfs_cluster_write() finds such a
partial page and handles it by leaving the page dirty but pushing it
into a writeback state. This used to work just fine, as the
write_cache_pages() code would then find the dirty partial page in
the next mapping tree lookup as the dirty tag is still set.

Unfortunately, when we moved to a mark and sweep approach to
writeback to fix other writeback sync issues, we broken this. THe
act of marking the page as under writeback now clears the TOWRITE
tag in the radix tree, even though the page is still dirty. This
causes the TOWRITE tag to be cleared, and hence the next lookup on
the mapping tree does not find the dirty partial page and so doesn't
try to write it again.

This same writeback bug was found recently in ext4 and fixed in
commit 1c8349a ("ext4: fix data integrity sync in ordered mode")
without communication to the wider filesystem community. We can use
exactly the same fix here so the TOWRITE flag is not cleared on
partial page writes.

cc: stable@vger.kernel.org # dependent on 1c8349a17137b93f0a83f276c764a6df1b9a116e
Root-cause-found-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index b984647..2f50253 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -434,10 +434,22 @@ xfs_start_page_writeback(
 {
 	ASSERT(PageLocked(page));
 	ASSERT(!PageWriteback(page));
-	if (clear_dirty)
+
+	/*
+	 * if the page was not fully cleaned, we need to ensure that the higher
+	 * layers come back to it correctly. That means we need to keep the page
+	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
+	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
+	 * write this page in this writeback sweep will be made.
+	 */
+	if (clear_dirty) {
 		clear_page_dirty_for_io(page);
-	set_page_writeback(page);
+		set_page_writeback(page);
+	} else
+		set_page_writeback_keepwrite(page);
+
 	unlock_page(page);
+
 	/* If no buffers on the page are to be written, finish it here */
 	if (!buffers)
 		end_page_writeback(page);
-- 
cgit v0.10.2


From 2c845f5a5f238f42376b6551a7f7716952c8f509 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 23 Sep 2014 15:37:09 +1000
Subject: xfs: track collapse via file offset rather than extent index

The collapse range implementation uses a transaction per extent shift.
The progress of the overall operation is tracked via the current extent
index of the in-core extent list. This is racy because the ilock must be
dropped and reacquired for each transaction according to locking and log
reservation rules. Therefore, writeback to prior regions of the file is
possible and can change the extent count. This changes the extent to
which the current index refers and causes the collapse to fail mid
operation. To avoid this problem, the entire file is currently written
back before the collapse operation starts.

To eliminate the need to flush the entire file, use the file offset
(fsb) to track the progress of the overall extent shift operation rather
than the extent index. Modify xfs_bmap_shift_extents() to
unconditionally convert the start_fsb parameter to an extent index and
return the file offset of the extent where the shift left off, if
further extents exist. The bulk of ths function can remain based on
extent index as ilock is held by the caller. xfs_collapse_file_space()
now uses the fsb output as the starting point for the subsequent shift.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 86df952..4b3f1b9 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5406,20 +5406,21 @@ error0:
 /*
  * Shift extent records to the left to cover a hole.
  *
- * The maximum number of extents to be shifted in a single operation
- * is @num_exts, and @current_ext keeps track of the current extent
- * index we have shifted. @offset_shift_fsb is the length by which each
- * extent is shifted. If there is no hole to shift the extents
- * into, this will be considered invalid operation and we abort immediately.
+ * The maximum number of extents to be shifted in a single operation is
+ * @num_exts. @start_fsb specifies the file offset to start the shift and the
+ * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
+ * is the length by which each extent is shifted. If there is no hole to shift
+ * the extents into, this will be considered invalid operation and we abort
+ * immediately.
  */
 int
 xfs_bmap_shift_extents(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
-	int			*done,
 	xfs_fileoff_t		start_fsb,
 	xfs_fileoff_t		offset_shift_fsb,
-	xfs_extnum_t		*current_ext,
+	int			*done,
+	xfs_fileoff_t		*next_fsb,
 	xfs_fsblock_t		*firstblock,
 	struct xfs_bmap_free	*flist,
 	int			num_exts)
@@ -5431,6 +5432,7 @@ xfs_bmap_shift_extents(
 	struct xfs_mount		*mp = ip->i_mount;
 	struct xfs_ifork		*ifp;
 	xfs_extnum_t			nexts = 0;
+	xfs_extnum_t			current_ext;
 	xfs_fileoff_t			startoff;
 	int				error = 0;
 	int				i;
@@ -5451,7 +5453,8 @@ xfs_bmap_shift_extents(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	ASSERT(current_ext != NULL);
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -5462,20 +5465,18 @@ xfs_bmap_shift_extents(
 	}
 
 	/*
-	 * If *current_ext is 0, we would need to lookup the extent
-	 * from where we would start shifting and store it in gotp.
+	 * Look up the extent index for the fsb where we start shifting. We can
+	 * henceforth iterate with current_ext as extent list changes are locked
+	 * out via ilock.
+	 *
+	 * gotp can be null in 2 cases: 1) if there are no extents or 2)
+	 * start_fsb lies in a hole beyond which there are no extents. Either
+	 * way, we are done.
 	 */
-	if (!*current_ext) {
-		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
-		/*
-		 * gotp can be null in 2 cases: 1) if there are no extents
-		 * or 2) start_fsb lies in a hole beyond which there are
-		 * no extents. Either way, we are done.
-		 */
-		if (!gotp) {
-			*done = 1;
-			return 0;
-		}
+	gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext);
+	if (!gotp) {
+		*done = 1;
+		return 0;
 	}
 
 	if (ifp->if_flags & XFS_IFBROOT) {
@@ -5487,36 +5488,32 @@ xfs_bmap_shift_extents(
 
 	/*
 	 * There may be delalloc extents in the data fork before the range we
-	 * are collapsing out, so we cannot
-	 * use the count of real extents here. Instead we have to calculate it
-	 * from the incore fork.
+	 * are collapsing out, so we cannot use the count of real extents here.
+	 * Instead we have to calculate it from the incore fork.
 	 */
 	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-	while (nexts++ < num_exts && *current_ext < total_extents) {
+	while (nexts++ < num_exts && current_ext < total_extents) {
 
-		gotp = xfs_iext_get_ext(ifp, *current_ext);
+		gotp = xfs_iext_get_ext(ifp, current_ext);
 		xfs_bmbt_get_all(gotp, &got);
 		startoff = got.br_startoff - offset_shift_fsb;
 
 		/*
-		 * Before shifting extent into hole, make sure that the hole
-		 * is large enough to accomodate the shift.
+		 * Before shifting extent into hole, make sure that the hole is
+		 * large enough to accommodate the shift.
 		 */
-		if (*current_ext) {
-			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
-						*current_ext - 1), &left);
-
+		if (current_ext > 0) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, current_ext - 1),
+					 &left);
 			if (startoff < left.br_startoff + left.br_blockcount)
 				error = -EINVAL;
 		} else if (offset_shift_fsb > got.br_startoff) {
 			/*
-			 * When first extent is shifted, offset_shift_fsb
-			 * should be less than the stating offset of
-			 * the first extent.
+			 * When first extent is shifted, offset_shift_fsb should
+			 * be less than the stating offset of the first extent.
 			 */
 			error = -EINVAL;
 		}
-
 		if (error)
 			goto del_cursor;
 
@@ -5531,7 +5528,7 @@ xfs_bmap_shift_extents(
 		}
 
 		/* Check if we can merge 2 adjacent extents */
-		if (*current_ext &&
+		if (current_ext &&
 		    left.br_startoff + left.br_blockcount == startoff &&
 		    left.br_startblock + left.br_blockcount ==
 				got.br_startblock &&
@@ -5539,7 +5536,7 @@ xfs_bmap_shift_extents(
 		    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
 			blockcount = left.br_blockcount +
 				got.br_blockcount;
-			xfs_iext_remove(ip, *current_ext, 1, 0);
+			xfs_iext_remove(ip, current_ext, 1, 0);
 			logflags |= XFS_ILOG_CORE;
 			if (cur) {
 				error = xfs_btree_delete(cur, &i);
@@ -5551,7 +5548,7 @@ xfs_bmap_shift_extents(
 			}
 			XFS_IFORK_NEXT_SET(ip, whichfork,
 				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
-			gotp = xfs_iext_get_ext(ifp, --*current_ext);
+			gotp = xfs_iext_get_ext(ifp, --current_ext);
 			xfs_bmbt_get_all(gotp, &got);
 
 			/* Make cursor point to the extent we will update */
@@ -5585,13 +5582,18 @@ xfs_bmap_shift_extents(
 			logflags |= XFS_ILOG_DEXT;
 		}
 
-		(*current_ext)++;
+		current_ext++;
 		total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
 	}
 
 	/* Check if we are done */
-	if (*current_ext == total_extents)
+	if (current_ext == total_extents)
 		*done = 1;
+	else if (next_fsb) {
+		gotp = xfs_iext_get_ext(ifp, current_ext);
+		xfs_bmbt_get_all(gotp, &got);
+		*next_fsb = got.br_startoff;
+	}
 
 del_cursor:
 	if (cur)
@@ -5600,5 +5602,6 @@ del_cursor:
 
 	if (logflags)
 		xfs_trans_log_inode(tp, ip, logflags);
+
 	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b879ca5..44db6db 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -178,9 +178,8 @@ int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
 		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
 int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
-		int *done, xfs_fileoff_t start_fsb,
-		xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
-		xfs_fsblock_t *firstblock, struct xfs_bmap_free	*flist,
-		int num_exts);
+		xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb,
+		int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock,
+		struct xfs_bmap_free *flist, int num_exts);
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1707980..1e96d77 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1456,18 +1456,18 @@ xfs_collapse_file_space(
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
 	int			error;
-	xfs_extnum_t		current_ext = 0;
 	struct xfs_bmap_free	free_list;
 	xfs_fsblock_t		first_block;
 	int			committed;
 	xfs_fileoff_t		start_fsb;
+	xfs_fileoff_t		next_fsb;
 	xfs_fileoff_t		shift_fsb;
 
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 
 	trace_xfs_collapse_file_space(ip);
 
-	start_fsb = XFS_B_TO_FSB(mp, offset + len);
+	next_fsb = XFS_B_TO_FSB(mp, offset + len);
 	shift_fsb = XFS_B_TO_FSB(mp, len);
 
 	/*
@@ -1525,10 +1525,10 @@ xfs_collapse_file_space(
 		 * We are using the write transaction in which max 2 bmbt
 		 * updates are allowed
 		 */
-		error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
-					       shift_fsb, &current_ext,
-					       &first_block, &free_list,
-					       XFS_BMAP_MAX_SHIFT_EXTENTS);
+		start_fsb = next_fsb;
+		error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb,
+				&done, &next_fsb, &first_block, &free_list,
+				XFS_BMAP_MAX_SHIFT_EXTENTS);
 		if (error)
 			goto out;
 
-- 
cgit v0.10.2


From ddb19e3180fa42362a04e86771d758be1de0bb13 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 23 Sep 2014 15:38:09 +1000
Subject: xfs: refactor shift-by-merge into xfs_bmse_merge() helper

The extent shift mechanism in xfs_bmap_shift_extents() is complicated
and handles several different, non-deterministic scenarios. These
include extent shifts, extent merges and potential btree updates in
either of the former scenarios.

Refactor the code to be more linear and readable. The loop logic in
xfs_bmap_shift_extents() and some initial error checking is adjusted
slightly. The associated btree lookup and update/delete operations are
condensed into single blocks of code. This reduces the number of
btree-specific blocks and facilitates the separation of the merge
operation into a new xfs_bmse_merge() and xfs_bmse_can_merge() helpers.

This is a code refactor only. The behavior of extent shift and collapse
range is not modified.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 4b3f1b9..532c4aa 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5404,6 +5404,120 @@ error0:
 }
 
 /*
+ * Determine whether an extent shift can be accomplished by a merge with the
+ * extent that precedes the target hole of the shift.
+ */
+STATIC bool
+xfs_bmse_can_merge(
+	struct xfs_bmbt_irec	*left,	/* preceding extent */
+	struct xfs_bmbt_irec	*got,	/* current extent to shift */
+	xfs_fileoff_t		shift)	/* shift fsb */
+{
+	xfs_fileoff_t		startoff;
+
+	startoff = got->br_startoff - shift;
+
+	/*
+	 * The extent, once shifted, must be adjacent in-file and on-disk with
+	 * the preceding extent.
+	 */
+	if ((left->br_startoff + left->br_blockcount != startoff) ||
+	    (left->br_startblock + left->br_blockcount != got->br_startblock) ||
+	    (left->br_state != got->br_state) ||
+	    (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
+		return false;
+
+	return true;
+}
+
+/*
+ * A bmap extent shift adjusts the file offset of an extent to fill a preceding
+ * hole in the file. If an extent shift would result in the extent being fully
+ * adjacent to the extent that currently precedes the hole, we can merge with
+ * the preceding extent rather than do the shift.
+ *
+ * This function assumes the caller has verified a shift-by-merge is possible
+ * with the provided extents via xfs_bmse_can_merge().
+ */
+STATIC int
+xfs_bmse_merge(
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			shift,		/* shift fsb */
+	int				current_ext,	/* idx of gotp */
+	struct xfs_bmbt_rec_host	*gotp,		/* extent to shift */
+	struct xfs_bmbt_rec_host	*leftp,		/* preceding extent */
+	struct xfs_btree_cur		*cur,
+	int				*logflags)	/* output */
+{
+	struct xfs_ifork		*ifp;
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		left;
+	xfs_filblks_t			blockcount;
+	int				error, i;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	xfs_bmbt_get_all(gotp, &got);
+	xfs_bmbt_get_all(leftp, &left);
+	blockcount = left.br_blockcount + got.br_blockcount;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_bmse_can_merge(&left, &got, shift));
+
+	/*
+	 * Merge the in-core extents. Note that the host record pointers and
+	 * current_ext index are invalid once the extent has been removed via
+	 * xfs_iext_remove().
+	 */
+	xfs_bmbt_set_blockcount(leftp, blockcount);
+	xfs_iext_remove(ip, current_ext, 1, 0);
+
+	/*
+	 * Update the on-disk extent count, the btree if necessary and log the
+	 * inode.
+	 */
+	XFS_IFORK_NEXT_SET(ip, whichfork,
+			   XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+	*logflags |= XFS_ILOG_CORE;
+	if (!cur) {
+		*logflags |= XFS_ILOG_DEXT;
+		return 0;
+	}
+
+	/* lookup and remove the extent to merge */
+	error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
+				   got.br_blockcount, &i);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+	error = xfs_btree_delete(cur, &i);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+	/* lookup and update size of the previous extent */
+	error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
+				   left.br_blockcount, &i);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+	left.br_blockcount = blockcount;
+
+	error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock,
+				left.br_blockcount, left.br_state);
+	if (error)
+		goto out_error;
+
+	return 0;
+
+out_error:
+	return error;
+}
+
+/*
  * Shift extent records to the left to cover a hole.
  *
  * The maximum number of extents to be shifted in a single operation is
@@ -5427,6 +5541,7 @@ xfs_bmap_shift_extents(
 {
 	struct xfs_btree_cur		*cur = NULL;
 	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_bmbt_rec_host	*leftp;
 	struct xfs_bmbt_irec            got;
 	struct xfs_bmbt_irec		left;
 	struct xfs_mount		*mp = ip->i_mount;
@@ -5438,7 +5553,6 @@ xfs_bmap_shift_extents(
 	int				i;
 	int				whichfork = XFS_DATA_FORK;
 	int				logflags = 0;
-	xfs_filblks_t			blockcount = 0;
 	int				total_extents;
 
 	if (unlikely(XFS_TEST_ERROR(
@@ -5464,6 +5578,13 @@ xfs_bmap_shift_extents(
 			return error;
 	}
 
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+	}
+
 	/*
 	 * Look up the extent index for the fsb where we start shifting. We can
 	 * henceforth iterate with current_ext as extent list changes are locked
@@ -5476,14 +5597,17 @@ xfs_bmap_shift_extents(
 	gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext);
 	if (!gotp) {
 		*done = 1;
-		return 0;
+		goto del_cursor;
 	}
+	xfs_bmbt_get_all(gotp, &got);
 
-	if (ifp->if_flags & XFS_IFBROOT) {
-		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-		cur->bc_private.b.firstblock = *firstblock;
-		cur->bc_private.b.flist = flist;
-		cur->bc_private.b.flags = 0;
+	/*
+	 * If the first extent is shifted, offset_shift_fsb cannot be larger
+	 * than the starting offset of the first extent.
+	 */
+	if (current_ext == 0 && got.br_startoff < offset_shift_fsb) {
+		error = -EINVAL;
+		goto del_cursor;
 	}
 
 	/*
@@ -5493,30 +5617,41 @@ xfs_bmap_shift_extents(
 	 */
 	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
 	while (nexts++ < num_exts && current_ext < total_extents) {
-
-		gotp = xfs_iext_get_ext(ifp, current_ext);
-		xfs_bmbt_get_all(gotp, &got);
 		startoff = got.br_startoff - offset_shift_fsb;
 
-		/*
-		 * Before shifting extent into hole, make sure that the hole is
-		 * large enough to accommodate the shift.
-		 */
+		/* grab the left extent and check for a potential merge */
 		if (current_ext > 0) {
-			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, current_ext - 1),
-					 &left);
-			if (startoff < left.br_startoff + left.br_blockcount)
+			leftp = xfs_iext_get_ext(ifp, current_ext - 1);
+			xfs_bmbt_get_all(leftp, &left);
+
+			/* make sure hole is large enough for shift */
+			if (startoff < left.br_startoff + left.br_blockcount) {
 				error = -EINVAL;
-		} else if (offset_shift_fsb > got.br_startoff) {
-			/*
-			 * When first extent is shifted, offset_shift_fsb should
-			 * be less than the stating offset of the first extent.
-			 */
-			error = -EINVAL;
+				goto del_cursor;
+			}
+
+			if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) {
+				error = xfs_bmse_merge(ip, whichfork,
+						offset_shift_fsb, current_ext, gotp,
+						leftp, cur, &logflags);
+				if (error)
+					goto del_cursor;
+
+				/*
+				 * The extent was merged so adjust the extent
+				 * index and move onto the next.
+				 */
+				current_ext--;
+				goto next;
+			}
 		}
-		if (error)
-			goto del_cursor;
 
+		/*
+		 * We didn't merge the extent so do the shift. Update the start
+		 * offset in the in-core extent and btree, if necessary.
+		 */
+		xfs_bmbt_set_startoff(gotp, startoff);
+		logflags |= XFS_ILOG_CORE;
 		if (cur) {
 			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
 						   got.br_startblock,
@@ -5525,53 +5660,8 @@ xfs_bmap_shift_extents(
 			if (error)
 				goto del_cursor;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-		}
-
-		/* Check if we can merge 2 adjacent extents */
-		if (current_ext &&
-		    left.br_startoff + left.br_blockcount == startoff &&
-		    left.br_startblock + left.br_blockcount ==
-				got.br_startblock &&
-		    left.br_state == got.br_state &&
-		    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
-			blockcount = left.br_blockcount +
-				got.br_blockcount;
-			xfs_iext_remove(ip, current_ext, 1, 0);
-			logflags |= XFS_ILOG_CORE;
-			if (cur) {
-				error = xfs_btree_delete(cur, &i);
-				if (error)
-					goto del_cursor;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-			} else {
-				logflags |= XFS_ILOG_DEXT;
-			}
-			XFS_IFORK_NEXT_SET(ip, whichfork,
-				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
-			gotp = xfs_iext_get_ext(ifp, --current_ext);
-			xfs_bmbt_get_all(gotp, &got);
-
-			/* Make cursor point to the extent we will update */
-			if (cur) {
-				error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
-							   got.br_startblock,
-							   got.br_blockcount,
-							   &i);
-				if (error)
-					goto del_cursor;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-			}
 
-			xfs_bmbt_set_blockcount(gotp, blockcount);
-			got.br_blockcount = blockcount;
-		} else {
-			/* We have to update the startoff */
-			xfs_bmbt_set_startoff(gotp, startoff);
 			got.br_startoff = startoff;
-		}
-
-		logflags |= XFS_ILOG_CORE;
-		if (cur) {
 			error = xfs_bmbt_update(cur, got.br_startoff,
 						got.br_startblock,
 						got.br_blockcount,
@@ -5582,18 +5672,20 @@ xfs_bmap_shift_extents(
 			logflags |= XFS_ILOG_DEXT;
 		}
 
-		current_ext++;
+next:
+		/* update total extent count and grab the next record */
 		total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+		if (++current_ext >= total_extents)
+			break;
+		gotp = xfs_iext_get_ext(ifp, current_ext);
+		xfs_bmbt_get_all(gotp, &got);
 	}
 
 	/* Check if we are done */
 	if (current_ext == total_extents)
 		*done = 1;
-	else if (next_fsb) {
-		gotp = xfs_iext_get_ext(ifp, current_ext);
-		xfs_bmbt_get_all(gotp, &got);
+	else if (next_fsb)
 		*next_fsb = got.br_startoff;
-	}
 
 del_cursor:
 	if (cur)
-- 
cgit v0.10.2


From a979bdfea10a61dce0055b4d416d640f4f5f495e Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 23 Sep 2014 15:39:04 +1000
Subject: xfs: refactor single extent shift into xfs_bmse_shift_one() helper

xfs_bmap_shift_extents() has a variety of conditions and error checks
that make the logic difficult to follow and indent heavy. Refactor the
loop body of this function into a new xfs_bmse_shift_one() helper. This
simplifies the error checks, eliminates index decrement on merge hack by
pushing the index increment down into the helper, and makes the code
more readable by reducing multiple levels of indentation.

This is a code refactor only. The behavior of extent shift and collapse
range is not modified.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 532c4aa..69bf8d8 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5518,6 +5518,88 @@ out_error:
 }
 
 /*
+ * Shift a single extent.
+ */
+STATIC int
+xfs_bmse_shift_one(
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			offset_shift_fsb,
+	int				*current_ext,
+	struct xfs_bmbt_rec_host	*gotp,
+	struct xfs_btree_cur		*cur,
+	int				*logflags)
+{
+	struct xfs_ifork		*ifp;
+	xfs_fileoff_t			startoff;
+	struct xfs_bmbt_rec_host	*leftp;
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		left;
+	int				error;
+	int				i;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	xfs_bmbt_get_all(gotp, &got);
+	startoff = got.br_startoff - offset_shift_fsb;
+
+	/*
+	 * If this is the first extent in the file, make sure there's enough
+	 * room at the start of the file and jump right to the shift as there's
+	 * no left extent to merge.
+	 */
+	if (*current_ext == 0) {
+		if (got.br_startoff < offset_shift_fsb)
+			return -EINVAL;
+		goto shift_extent;
+	}
+
+	/* grab the left extent and check for a large enough hole */
+	leftp = xfs_iext_get_ext(ifp, *current_ext - 1);
+	xfs_bmbt_get_all(leftp, &left);
+
+	if (startoff < left.br_startoff + left.br_blockcount)
+		return -EINVAL;
+
+	/* check whether to merge the extent or shift it down */
+	if (!xfs_bmse_can_merge(&left, &got, offset_shift_fsb))
+		goto shift_extent;
+
+	return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, *current_ext,
+			      gotp, leftp, cur, logflags);
+
+shift_extent:
+	/*
+	 * Increment the extent index for the next iteration, update the start
+	 * offset of the in-core extent and update the btree if applicable.
+	 */
+	(*current_ext)++;
+	xfs_bmbt_set_startoff(gotp, startoff);
+	*logflags |= XFS_ILOG_CORE;
+	if (!cur) {
+		*logflags |= XFS_ILOG_DEXT;
+		return 0;
+	}
+
+	error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
+				   got.br_blockcount, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+	got.br_startoff = startoff;
+	error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
+				got.br_blockcount, got.br_state);
+	if (error)
+		return error;
+
+	return 0;
+
+out_error:
+	return error;
+}
+
+/*
  * Shift extent records to the left to cover a hole.
  *
  * The maximum number of extents to be shifted in a single operation is
@@ -5541,16 +5623,12 @@ xfs_bmap_shift_extents(
 {
 	struct xfs_btree_cur		*cur = NULL;
 	struct xfs_bmbt_rec_host	*gotp;
-	struct xfs_bmbt_rec_host	*leftp;
 	struct xfs_bmbt_irec            got;
-	struct xfs_bmbt_irec		left;
 	struct xfs_mount		*mp = ip->i_mount;
 	struct xfs_ifork		*ifp;
 	xfs_extnum_t			nexts = 0;
 	xfs_extnum_t			current_ext;
-	xfs_fileoff_t			startoff;
 	int				error = 0;
-	int				i;
 	int				whichfork = XFS_DATA_FORK;
 	int				logflags = 0;
 	int				total_extents;
@@ -5599,16 +5677,6 @@ xfs_bmap_shift_extents(
 		*done = 1;
 		goto del_cursor;
 	}
-	xfs_bmbt_get_all(gotp, &got);
-
-	/*
-	 * If the first extent is shifted, offset_shift_fsb cannot be larger
-	 * than the starting offset of the first extent.
-	 */
-	if (current_ext == 0 && got.br_startoff < offset_shift_fsb) {
-		error = -EINVAL;
-		goto del_cursor;
-	}
 
 	/*
 	 * There may be delalloc extents in the data fork before the range we
@@ -5617,75 +5685,25 @@ xfs_bmap_shift_extents(
 	 */
 	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
 	while (nexts++ < num_exts && current_ext < total_extents) {
-		startoff = got.br_startoff - offset_shift_fsb;
-
-		/* grab the left extent and check for a potential merge */
-		if (current_ext > 0) {
-			leftp = xfs_iext_get_ext(ifp, current_ext - 1);
-			xfs_bmbt_get_all(leftp, &left);
-
-			/* make sure hole is large enough for shift */
-			if (startoff < left.br_startoff + left.br_blockcount) {
-				error = -EINVAL;
-				goto del_cursor;
-			}
-
-			if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) {
-				error = xfs_bmse_merge(ip, whichfork,
-						offset_shift_fsb, current_ext, gotp,
-						leftp, cur, &logflags);
-				if (error)
-					goto del_cursor;
-
-				/*
-				 * The extent was merged so adjust the extent
-				 * index and move onto the next.
-				 */
-				current_ext--;
-				goto next;
-			}
-		}
-
-		/*
-		 * We didn't merge the extent so do the shift. Update the start
-		 * offset in the in-core extent and btree, if necessary.
-		 */
-		xfs_bmbt_set_startoff(gotp, startoff);
-		logflags |= XFS_ILOG_CORE;
-		if (cur) {
-			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
-						   got.br_startblock,
-						   got.br_blockcount,
-						   &i);
-			if (error)
-				goto del_cursor;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-
-			got.br_startoff = startoff;
-			error = xfs_bmbt_update(cur, got.br_startoff,
-						got.br_startblock,
-						got.br_blockcount,
-						got.br_state);
-			if (error)
-				goto del_cursor;
-		} else {
-			logflags |= XFS_ILOG_DEXT;
-		}
+		error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
+					&current_ext, gotp, cur, &logflags);
+		if (error)
+			goto del_cursor;
 
-next:
 		/* update total extent count and grab the next record */
 		total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-		if (++current_ext >= total_extents)
+		if (current_ext >= total_extents)
 			break;
 		gotp = xfs_iext_get_ext(ifp, current_ext);
-		xfs_bmbt_get_all(gotp, &got);
 	}
 
 	/* Check if we are done */
-	if (current_ext == total_extents)
+	if (current_ext == total_extents) {
 		*done = 1;
-	else if (next_fsb)
+	} else if (next_fsb) {
+		xfs_bmbt_get_all(gotp, &got);
 		*next_fsb = got.br_startoff;
+	}
 
 del_cursor:
 	if (cur)
-- 
cgit v0.10.2


From f71721d061e872a39b2680d13f309c1eb6893438 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 23 Sep 2014 15:39:05 +1000
Subject: xfs: writeback and inval. file range to be shifted by collapse

The collapse range operation currently writes the entire file before
starting the collapse to avoid changes in the in-core extent list due to
writeback causing the extent count to change. Now that collapse range is
fsb based rather than extent index based it can sustain changes in the
extent list during the shift sequence without disruption.

Modify xfs_collapse_file_space() to writeback and invalidate pages
associated with the range of the file to be shifted.
xfs_free_file_space() currently has similar behavior, but the space free
need only affect the region of the file that is freed and this could
change in the future.

Also update the comments to reflect the current implementation. We
retain the eofblocks trim permanently as a best option for dealing with
delalloc extents. We don't shift delalloc extents because this scenario
only occurs with post-eof preallocation (since data must be flushed such
that the cache can be invalidated and data can be shifted). That means
said space must also be initialized before being shifted into the
accessible region of the file only to be immediately truncated off as
the last part of the collapse. In other words, the eofblocks trim will
happen anyways, we just run it first to ensure the file remains in a
consistent state throughout the collapse.

Finally, detect and fail explicitly in the event of a delalloc extent
during the extent shift. The implementation does not support delalloc
extents and the caller is expected to prevent this scenario in advance
as is done by collapse.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 69bf8d8..79c9819 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5543,6 +5543,10 @@ xfs_bmse_shift_one(
 	xfs_bmbt_get_all(gotp, &got);
 	startoff = got.br_startoff - offset_shift_fsb;
 
+	/* delalloc extents should be prevented by caller */
+	XFS_WANT_CORRUPTED_GOTO(!isnullstartblock(got.br_startblock),
+				out_error);
+
 	/*
 	 * If this is the first extent in the file, make sure there's enough
 	 * room at the start of the file and jump right to the shift as there's
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1e96d77..eae763f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1470,27 +1470,33 @@ xfs_collapse_file_space(
 	next_fsb = XFS_B_TO_FSB(mp, offset + len);
 	shift_fsb = XFS_B_TO_FSB(mp, len);
 
-	/*
-	 * Writeback the entire file and force remove any post-eof blocks. The
-	 * writeback prevents changes to the extent list via concurrent
-	 * writeback and the eofblocks trim prevents the extent shift algorithm
-	 * from running into a post-eof delalloc extent.
-	 *
-	 * XXX: This is a temporary fix until the extent shift loop below is
-	 * converted to use offsets and lookups within the ILOCK rather than
-	 * carrying around the index into the extent list for the next
-	 * iteration.
-	 */
-	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	error = xfs_free_file_space(ip, offset, len);
 	if (error)
 		return error;
+
+	/*
+	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
+	 * into the accessible region of the file.
+	 */
 	if (xfs_can_free_eofblocks(ip, true)) {
 		error = xfs_free_eofblocks(mp, ip, false);
 		if (error)
 			return error;
 	}
 
-	error = xfs_free_file_space(ip, offset, len);
+	/*
+	 * Writeback and invalidate cache for the remainder of the file as we're
+	 * about to shift down every extent from the collapse range to EOF. The
+	 * free of the collapse range above might have already done some of
+	 * this, but we shouldn't rely on it to do anything outside of the range
+	 * that was freed.
+	 */
+	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+					     offset + len, -1);
+	if (error)
+		return error;
+	error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					(offset + len) >> PAGE_CACHE_SHIFT, -1);
 	if (error)
 		return error;
 
-- 
cgit v0.10.2


From 8b5279e33f241a074a9c8649bba8f77a2167b798 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 23 Sep 2014 15:39:05 +1000
Subject: xfs: only writeback and truncate pages for the freed range

xfs_free_file_space() only affects the range of the file for which space
is being freed. It currently writes and truncates the page cache from
the start offset of the free to EOF.

Modify xfs_free_file_space() to write back and truncate page cache of
just the range being freed.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index eae763f..809ae7d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1205,6 +1205,7 @@ xfs_free_file_space(
 	xfs_bmap_free_t		free_list;
 	xfs_bmbt_irec_t		imap;
 	xfs_off_t		ioffset;
+	xfs_off_t		iendoffset;
 	xfs_extlen_t		mod=0;
 	xfs_mount_t		*mp;
 	int			nimap;
@@ -1233,12 +1234,13 @@ xfs_free_file_space(
 	inode_dio_wait(VFS_I(ip));
 
 	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
-	ioffset = offset & ~(rounding - 1);
-	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-					      ioffset, -1);
+	ioffset = round_down(offset, rounding);
+	iendoffset = round_up(offset + len, rounding) - 1;
+	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
+					     iendoffset);
 	if (error)
 		goto out;
-	truncate_pagecache_range(VFS_I(ip), ioffset, -1);
+	truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
 
 	/*
 	 * Need to zero the stuff we're not freeing, on disk.
-- 
cgit v0.10.2


From 8af3dcd3c89aef10375bdd79d5f336b22b57487c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 23 Sep 2014 15:57:59 +1000
Subject: xfs: xlog_cil_force_lsn doesn't always wait correctly

When running a tight mount/unmount loop on an older kernel, RedHat
QE found that unmount would occasionally hang in
xfs_buf_unpin_wait() on the superblock buffer. Tracing and other
debug work by Eric Sandeen indicated that it was hanging on the
writing of the superblock during unmount immediately after logging
the superblock counters in a synchronous transaction. Further debug
indicated that the synchronous transaction was not waiting for
completion correctly, and we narrowed it down to
xlog_cil_force_lsn() returning NULLCOMMITLSN and hence not pushing
the transaction in the iclog buffer to disk correctly.

While this unmount superblock write code is now very different in
mainline kernels, the xlog_cil_force_lsn() code is identical, and it
was bisected to the backport of commit f876e44 ("xfs: always do log
forces via the workqueue"). This commit made the CIL push
asynchronous for log forces and hence exposed a race condition that
couldn't occur on a synchronous push.

Essentially, the xlog_cil_force_lsn() relied implicitly on the fact
that the sequence push would be complete by the time
xlog_cil_push_now() returned, resulting in the context being pushed
being in the committing list. When it was made asynchronous, it was
recognised that there was a race condition in detecting whether an
asynchronous push has started or not and code was added to handle
it.

Unfortunately, the fix was not quite right and left a race condition
where it it would detect an empty CIL while a push was in progress
before the context had been added to the committing list. This was
incorrectly seen as a "nothing to do" condition and so would tell
xfs_log_force_lsn() that there is nothing to wait for, and hence it
would push the iclogbufs in memory.

The fix is simple, but explaining the logic and the race condition
is a lot more complex. The fix is to add the context to the
committing list before we start emptying the CIL. This allows us to
detect the difference between an empty "do nothing" push and a push
that has not started by adding a discrete "emptying the CIL" state
to avoid the transient, incorrect "empty" condition that the
(unchanged) waiting code was seeing.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index f6b79e5..f506c45 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -463,12 +463,40 @@ xlog_cil_push(
 		spin_unlock(&cil->xc_push_lock);
 		goto out_skip;
 	}
-	spin_unlock(&cil->xc_push_lock);
 
 
 	/* check for a previously pushed seqeunce */
-	if (push_seq < cil->xc_ctx->sequence)
+	if (push_seq < cil->xc_ctx->sequence) {
+		spin_unlock(&cil->xc_push_lock);
 		goto out_skip;
+	}
+
+	/*
+	 * We are now going to push this context, so add it to the committing
+	 * list before we do anything else. This ensures that anyone waiting on
+	 * this push can easily detect the difference between a "push in
+	 * progress" and "CIL is empty, nothing to do".
+	 *
+	 * IOWs, a wait loop can now check for:
+	 *	the current sequence not being found on the committing list;
+	 *	an empty CIL; and
+	 *	an unchanged sequence number
+	 * to detect a push that had nothing to do and therefore does not need
+	 * waiting on. If the CIL is not empty, we get put on the committing
+	 * list before emptying the CIL and bumping the sequence number. Hence
+	 * an empty CIL and an unchanged sequence number means we jumped out
+	 * above after doing nothing.
+	 *
+	 * Hence the waiter will either find the commit sequence on the
+	 * committing list or the sequence number will be unchanged and the CIL
+	 * still dirty. In that latter case, the push has not yet started, and
+	 * so the waiter will have to continue trying to check the CIL
+	 * committing list until it is found. In extreme cases of delay, the
+	 * sequence may fully commit between the attempts the wait makes to wait
+	 * on the commit sequence.
+	 */
+	list_add(&ctx->committing, &cil->xc_committing);
+	spin_unlock(&cil->xc_push_lock);
 
 	/*
 	 * pull all the log vectors off the items in the CIL, and
@@ -532,7 +560,6 @@ xlog_cil_push(
 	 */
 	spin_lock(&cil->xc_push_lock);
 	cil->xc_current_sequence = new_ctx->sequence;
-	list_add(&ctx->committing, &cil->xc_committing);
 	spin_unlock(&cil->xc_push_lock);
 	up_write(&cil->xc_ctx_lock);
 
@@ -855,13 +882,15 @@ restart:
 	 * Hence by the time we have got here it our sequence may not have been
 	 * pushed yet. This is true if the current sequence still matches the
 	 * push sequence after the above wait loop and the CIL still contains
-	 * dirty objects.
+	 * dirty objects. This is guaranteed by the push code first adding the
+	 * context to the committing list before emptying the CIL.
 	 *
-	 * When the push occurs, it will empty the CIL and atomically increment
-	 * the currect sequence past the push sequence and move it into the
-	 * committing list. Of course, if the CIL is clean at the time of the
-	 * push, it won't have pushed the CIL at all, so in that case we should
-	 * try the push for this sequence again from the start just in case.
+	 * Hence if we don't find the context in the committing list and the
+	 * current sequence number is unchanged then the CIL contents are
+	 * significant.  If the CIL is empty, if means there was nothing to push
+	 * and that means there is nothing to wait for. If the CIL is not empty,
+	 * it means we haven't yet started the push, because if it had started
+	 * we would have found the context on the committing list.
 	 */
 	if (sequence == cil->xc_current_sequence &&
 	    !list_empty(&cil->xc_cil)) {
-- 
cgit v0.10.2


From fb040131561a6b34cefb92cdf598218104abeda0 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 23 Sep 2014 16:05:32 +1000
Subject: xfs: don't ASSERT on corrupt ftype

xfs_dir3_data_get_ftype() gets the file type off disk, but ASSERTs
if it's invalid:

     ASSERT(type < XFS_DIR3_FT_MAX);

We shouldn't ASSERT on bad values read from disk.  V3 dirs are
CRC-protected, but V2 dirs + ftype are not.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index c9aee52..7e42fdf 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -270,7 +270,6 @@ xfs_dir3_data_get_ftype(
 {
 	__uint8_t	ftype = dep->name[dep->namelen];
 
-	ASSERT(ftype < XFS_DIR3_FT_MAX);
 	if (ftype >= XFS_DIR3_FT_MAX)
 		return XFS_DIR3_FT_UNKNOWN;
 	return ftype;
-- 
cgit v0.10.2


From e3cf17962a757e59fed2cbcbda6373c1b35a35dd Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Tue, 23 Sep 2014 16:05:55 +1000
Subject: xfs: remove second xfs_quota.h inclusion in xfs_icache.c

xfs_quota.h was included twice.

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 981b2cf..b45f7b2 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -33,7 +33,6 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_bmap_util.h"
-#include "xfs_quota.h"
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
 
-- 
cgit v0.10.2


From ea95961df714f7fc446aa4bedfc61510ed1b59cc Mon Sep 17 00:00:00 2001
From: Fengguang Wu <fengguang.wu@intel.com>
Date: Tue, 23 Sep 2014 16:11:43 +1000
Subject: xfs: xfs_rtget_summary can be static

Fix sparse warning introduced by commit afabfd3 ("xfs: combine
xfs_rtmodify_summary and xfs_rtget_summary").

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index d1160cc..d45aebe 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -46,7 +46,7 @@
  * Keeps track of a current summary block, so we don't keep reading
  * it from the buffer cache.
  */
-int
+static int
 xfs_rtget_summary(
 	xfs_mount_t	*mp,		/* file system mount structure */
 	xfs_trans_t	*tp,		/* transaction pointer */
-- 
cgit v0.10.2


From 02cc18764c753befcdc163d1bc668a6599a54585 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 23 Sep 2014 16:15:45 +1000
Subject: xfs: xfs_buf_write_fail_rl_state can be static

Fix sparse warning introduced by commit ac8809f9 ("xfs: abort
metadata writeback on permanent errors").

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 76007de..30fa5db 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -501,7 +501,7 @@ xfs_buf_item_unpin(
  * buffer being bad..
  */
 
-DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
+static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
 
 STATIC uint
 xfs_buf_item_push(
-- 
cgit v0.10.2


From 7abbb8f928e5b7cea1edd077131b2ace665c6712 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 23 Sep 2014 16:20:11 +1000
Subject: xfs: xfs_swap_extent_flush can be static

Fix sparse warning introduced by commit 4ef897a ("xfs: flush both
inodes in xfs_swap_extents").

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 2f1e30d..1cb345e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1618,7 +1618,7 @@ xfs_swap_extents_check_format(
 	return 0;
 }
 
-int
+static int
 xfs_swap_extent_flush(
 	struct xfs_inode	*ip)
 {
-- 
cgit v0.10.2


From 2ebff7bbd785c86e12956388b9e6f6bb8ea5d21e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 23 Sep 2014 22:55:00 +1000
Subject: xfs: flush entire last page of old EOF on truncate up

On a sub-page sized filesystem, truncating a mapped region down
leaves us in a world of hurt. We truncate the pagecache, zeroing the
newly unused tail, then punch blocks out from under the page. If we
then truncate the file back up immediately, we expose that unmapped
hole to a dirty page mapped into the user application, and that's
where it all goes wrong.

In truncating the page cache, we avoid unmapping the tail page of
the cache because it still contains valid data. The problem is that
it also contains a hole after the truncate, but nobody told the mm
subsystem that. Therefore, if the page is dirty before the truncate,
we'll never get a .page_mkwrite callout after we extend the file and
the application writes data into the hole on the page.  Hence when
we come to writing that region of the page, it has no blocks and no
delayed allocation reservation and hence we toss the data away.

This patch adds code to the truncate up case to solve it, by
ensuring the partial page at the old EOF is always cleaned after we
do any zeroing and move the EOF upwards. We can't actually serialise
the page writeback and truncate against page faults (yes, that
problem AGAIN) so this is really just a best effort and assumes it
is extremely unlikely that someone is concurrently writing to the
page at the EOF while extending the file.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 7212949..ec6dcdc 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -849,6 +849,36 @@ xfs_setattr_size(
 		return error;
 	truncate_setsize(inode, newsize);
 
+	/*
+	 * The "we can't serialise against page faults" pain gets worse.
+	 *
+	 * If the file is mapped then we have to clean the page at the old EOF
+	 * when extending the file. Extending the file can expose changes the
+	 * underlying page mapping (e.g. from beyond EOF to a hole or
+	 * unwritten), and so on the next attempt to write to that page we need
+	 * to remap it for write. i.e. we need .page_mkwrite() to be called.
+	 * Hence we need to clean the page to clean the pte and so a new write
+	 * fault will be triggered appropriately.
+	 *
+	 * If we do it before we change the inode size, then we can race with a
+	 * page fault that maps the page with exactly the same problem. If we do
+	 * it after we change the file size, then a new page fault can come in
+	 * and allocate space before we've run the rest of the truncate
+	 * transaction. That's kinda grotesque, but it's better than have data
+	 * over a hole, and so that's the lesser evil that has been chosen here.
+	 *
+	 * The real solution, however, is to have some mechanism for locking out
+	 * page faults while a truncate is in progress.
+	 */
+	if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
+		error = filemap_write_and_wait_range(
+				VFS_I(ip)->i_mapping,
+				round_down(oldsize, PAGE_CACHE_SIZE),
+				round_up(oldsize, PAGE_CACHE_SIZE) - 1);
+		if (error)
+			return error;
+	}
+
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
 	if (error)
-- 
cgit v0.10.2


From eeb1168810d8a140f6834f8c4975f7bb3277d790 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 29 Sep 2014 09:45:03 +1000
Subject: xfs: refactor xlog_recover_process_data()

Clean up xlog_recover_process_data() structure in preparation for
fixing the allocation and freeing context of the transaction being
recovered.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1fd5787..8105b85 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3535,12 +3535,124 @@ out:
 }
 
 STATIC int
-xlog_recover_unmount_trans(
-	struct xlog		*log)
+xlog_recovery_process_trans(
+	struct xlog		*log,
+	struct xlog_recover	*trans,
+	xfs_caddr_t		dp,
+	unsigned int		len,
+	unsigned int		flags,
+	int			pass)
 {
-	/* Do nothing now */
-	xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
-	return 0;
+	int			error = -EIO;
+
+	/* mask off ophdr transaction container flags */
+	flags &= ~XLOG_END_TRANS;
+	if (flags & XLOG_WAS_CONT_TRANS)
+		flags &= ~XLOG_CONTINUE_TRANS;
+
+	switch (flags) {
+	/* expected flag values */
+	case 0:
+	case XLOG_CONTINUE_TRANS:
+		error = xlog_recover_add_to_trans(log, trans, dp, len);
+		break;
+	case XLOG_WAS_CONT_TRANS:
+		error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
+		break;
+	case XLOG_COMMIT_TRANS:
+		error = xlog_recover_commit_trans(log, trans, pass);
+		break;
+
+	/* unexpected flag values */
+	case XLOG_UNMOUNT_TRANS:
+		xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
+		error = 0; /* just skip trans */
+		break;
+	case XLOG_START_TRANS:
+		xfs_warn(log->l_mp, "%s: bad transaction", __func__);
+		ASSERT(0);
+		break;
+	default:
+		xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
+		ASSERT(0);
+		break;
+	}
+	return error;
+}
+
+STATIC struct xlog_recover *
+xlog_recover_ophdr_to_trans(
+	struct hlist_head	rhash[],
+	struct xlog_rec_header	*rhead,
+	struct xlog_op_header	*ohead)
+{
+	struct xlog_recover	*trans;
+	xlog_tid_t		tid;
+	struct hlist_head	*rhp;
+
+	tid = be32_to_cpu(ohead->oh_tid);
+	rhp = &rhash[XLOG_RHASH(tid)];
+	trans = xlog_recover_find_tid(rhp, tid);
+	if (trans)
+		return trans;
+
+	/*
+	 * If this is a new transaction, the ophdr only contains the
+	 * start record. In that case, the only processing we need to do
+	 * on this opheader is allocate a new recovery container to hold
+	 * the recovery ops that will follow.
+	 */
+	if (ohead->oh_flags & XLOG_START_TRANS) {
+		ASSERT(be32_to_cpu(ohead->oh_len) == 0);
+		xlog_recover_new_tid(rhp, tid, be64_to_cpu(rhead->h_lsn));
+	}
+	return NULL;
+}
+
+STATIC int
+xlog_recover_process_ophdr(
+	struct xlog		*log,
+	struct hlist_head	rhash[],
+	struct xlog_rec_header	*rhead,
+	struct xlog_op_header	*ohead,
+	xfs_caddr_t		dp,
+	xfs_caddr_t		end,
+	int			pass)
+{
+	struct xlog_recover	*trans;
+	int			error;
+	unsigned int		len;
+
+	/* Do we understand who wrote this op? */
+	if (ohead->oh_clientid != XFS_TRANSACTION &&
+	    ohead->oh_clientid != XFS_LOG) {
+		xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
+			__func__, ohead->oh_clientid);
+		ASSERT(0);
+		return -EIO;
+	}
+
+	/*
+	 * Check the ophdr contains all the data it is supposed to contain.
+	 */
+	len = be32_to_cpu(ohead->oh_len);
+	if (dp + len > end) {
+		xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
+		WARN_ON(1);
+		return -EIO;
+	}
+
+	trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
+	if (!trans) {
+		/* nothing to do, so skip over this ophdr */
+		return 0;
+	}
+
+	error = xlog_recovery_process_trans(log, trans, dp, len,
+					    ohead->oh_flags, pass);
+	if (error)
+		xlog_recover_free_trans(trans);
+	return error;
 }
 
 /*
@@ -3560,86 +3672,30 @@ xlog_recover_process_data(
 	xfs_caddr_t		dp,
 	int			pass)
 {
-	xfs_caddr_t		lp;
+	struct xlog_op_header	*ohead;
+	xfs_caddr_t		end;
 	int			num_logops;
-	xlog_op_header_t	*ohead;
-	xlog_recover_t		*trans;
-	xlog_tid_t		tid;
 	int			error;
-	unsigned long		hash;
-	uint			flags;
 
-	lp = dp + be32_to_cpu(rhead->h_len);
+	end = dp + be32_to_cpu(rhead->h_len);
 	num_logops = be32_to_cpu(rhead->h_num_logops);
 
 	/* check the log format matches our own - else we can't recover */
 	if (xlog_header_check_recover(log->l_mp, rhead))
 		return -EIO;
 
-	while ((dp < lp) && num_logops) {
-		ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
-		ohead = (xlog_op_header_t *)dp;
-		dp += sizeof(xlog_op_header_t);
-		if (ohead->oh_clientid != XFS_TRANSACTION &&
-		    ohead->oh_clientid != XFS_LOG) {
-			xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
-					__func__, ohead->oh_clientid);
-			ASSERT(0);
-			return -EIO;
-		}
-		tid = be32_to_cpu(ohead->oh_tid);
-		hash = XLOG_RHASH(tid);
-		trans = xlog_recover_find_tid(&rhash[hash], tid);
-		if (trans == NULL) {		   /* not found; add new tid */
-			if (ohead->oh_flags & XLOG_START_TRANS)
-				xlog_recover_new_tid(&rhash[hash], tid,
-					be64_to_cpu(rhead->h_lsn));
-		} else {
-			if (dp + be32_to_cpu(ohead->oh_len) > lp) {
-				xfs_warn(log->l_mp, "%s: bad length 0x%x",
-					__func__, be32_to_cpu(ohead->oh_len));
-				WARN_ON(1);
-				return -EIO;
-			}
-			flags = ohead->oh_flags & ~XLOG_END_TRANS;
-			if (flags & XLOG_WAS_CONT_TRANS)
-				flags &= ~XLOG_CONTINUE_TRANS;
-			switch (flags) {
-			case XLOG_COMMIT_TRANS:
-				error = xlog_recover_commit_trans(log,
-								trans, pass);
-				break;
-			case XLOG_UNMOUNT_TRANS:
-				error = xlog_recover_unmount_trans(log);
-				break;
-			case XLOG_WAS_CONT_TRANS:
-				error = xlog_recover_add_to_cont_trans(log,
-						trans, dp,
-						be32_to_cpu(ohead->oh_len));
-				break;
-			case XLOG_START_TRANS:
-				xfs_warn(log->l_mp, "%s: bad transaction",
-					__func__);
-				ASSERT(0);
-				error = -EIO;
-				break;
-			case 0:
-			case XLOG_CONTINUE_TRANS:
-				error = xlog_recover_add_to_trans(log, trans,
-						dp, be32_to_cpu(ohead->oh_len));
-				break;
-			default:
-				xfs_warn(log->l_mp, "%s: bad flag 0x%x",
-					__func__, flags);
-				ASSERT(0);
-				error = -EIO;
-				break;
-			}
-			if (error) {
-				xlog_recover_free_trans(trans);
-				return error;
-			}
-		}
+	while ((dp < end) && num_logops) {
+
+		ohead = (struct xlog_op_header *)dp;
+		dp += sizeof(*ohead);
+		ASSERT(dp <= end);
+
+		/* errors will abort recovery */
+		error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
+						    dp, end, pass);
+		if (error)
+			return error;
+
 		dp += be32_to_cpu(ohead->oh_len);
 		num_logops--;
 	}
-- 
cgit v0.10.2


From e9131e50f9d0a632e3011d73f283ba69be0cc682 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 29 Sep 2014 09:45:18 +1000
Subject: xfs: recovery of XLOG_UNMOUNT_TRANS leaks memory

The XLOG_UNMOUNT_TRANS case skips the transaction, despite the fact
an unmount record is always in a standalone transaction. Hence
whenever we come across one of these we need to free the transaction
structure associated with it as there is no commit record that
follows it.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 8105b85..6d1c783 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3534,6 +3534,9 @@ out:
 	return error ? error : error2;
 }
 
+/*
+ * On error or completion, trans is freed.
+ */
 STATIC int
 xlog_recovery_process_trans(
 	struct xlog		*log,
@@ -3543,7 +3546,8 @@ xlog_recovery_process_trans(
 	unsigned int		flags,
 	int			pass)
 {
-	int			error = -EIO;
+	int			error = 0;
+	bool			freeit = false;
 
 	/* mask off ophdr transaction container flags */
 	flags &= ~XLOG_END_TRANS;
@@ -3565,18 +3569,19 @@ xlog_recovery_process_trans(
 
 	/* unexpected flag values */
 	case XLOG_UNMOUNT_TRANS:
+		/* just skip trans */
 		xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
-		error = 0; /* just skip trans */
+		freeit = true;
 		break;
 	case XLOG_START_TRANS:
-		xfs_warn(log->l_mp, "%s: bad transaction", __func__);
-		ASSERT(0);
-		break;
 	default:
 		xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
 		ASSERT(0);
+		error = -EIO;
 		break;
 	}
+	if (error || freeit)
+		xlog_recover_free_trans(trans);
 	return error;
 }
 
@@ -3620,7 +3625,6 @@ xlog_recover_process_ophdr(
 	int			pass)
 {
 	struct xlog_recover	*trans;
-	int			error;
 	unsigned int		len;
 
 	/* Do we understand who wrote this op? */
@@ -3648,11 +3652,8 @@ xlog_recover_process_ophdr(
 		return 0;
 	}
 
-	error = xlog_recovery_process_trans(log, trans, dp, len,
-					    ohead->oh_flags, pass);
-	if (error)
-		xlog_recover_free_trans(trans);
-	return error;
+	return xlog_recovery_process_trans(log, trans, dp, len,
+					   ohead->oh_flags, pass);
 }
 
 /*
-- 
cgit v0.10.2


From 88b863db97a18a04c90ebd57d84e1b7863114dcb Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 29 Sep 2014 09:45:32 +1000
Subject: xfs: fix double free in xlog_recover_commit_trans

When an error occurs during buffer submission in
xlog_recover_commit_trans(), we free the trans structure twice. Fix
it by only freeing the structure in the caller regardless of the
success or failure of the function.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6d1c783..89574ff 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3528,8 +3528,6 @@ out:
 	if (!list_empty(&done_list))
 		list_splice_init(&done_list, &trans->r_itemq);
 
-	xlog_recover_free_trans(trans);
-
 	error2 = xfs_buf_delwri_submit(&buffer_list);
 	return error ? error : error2;
 }
@@ -3554,6 +3552,10 @@ xlog_recovery_process_trans(
 	if (flags & XLOG_WAS_CONT_TRANS)
 		flags &= ~XLOG_CONTINUE_TRANS;
 
+	/*
+	 * Callees must not free the trans structure. We'll decide if we need to
+	 * free it or not based on the operation being done and it's result.
+	 */
 	switch (flags) {
 	/* expected flag values */
 	case 0:
@@ -3565,6 +3567,8 @@ xlog_recovery_process_trans(
 		break;
 	case XLOG_COMMIT_TRANS:
 		error = xlog_recover_commit_trans(log, trans, pass);
+		/* success or fail, we are now done with this transaction. */
+		freeit = true;
 		break;
 
 	/* unexpected flag values */
-- 
cgit v0.10.2


From 76560669868d3b4d650d91d9bf467a8d81171766 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 29 Sep 2014 09:45:42 +1000
Subject: xfs: reorganise transaction recovery item code

The code for managing transactions anf the items for recovery is
spread across 3 different locations in the file. Move them all
together so that it is easy to read the code without needing to jump
long distances in the file.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 89574ff..6af9e1a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1445,160 +1445,6 @@ xlog_clear_stale_blocks(
  ******************************************************************************
  */
 
-STATIC xlog_recover_t *
-xlog_recover_find_tid(
-	struct hlist_head	*head,
-	xlog_tid_t		tid)
-{
-	xlog_recover_t		*trans;
-
-	hlist_for_each_entry(trans, head, r_list) {
-		if (trans->r_log_tid == tid)
-			return trans;
-	}
-	return NULL;
-}
-
-STATIC void
-xlog_recover_new_tid(
-	struct hlist_head	*head,
-	xlog_tid_t		tid,
-	xfs_lsn_t		lsn)
-{
-	xlog_recover_t		*trans;
-
-	trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
-	trans->r_log_tid   = tid;
-	trans->r_lsn	   = lsn;
-	INIT_LIST_HEAD(&trans->r_itemq);
-
-	INIT_HLIST_NODE(&trans->r_list);
-	hlist_add_head(&trans->r_list, head);
-}
-
-STATIC void
-xlog_recover_add_item(
-	struct list_head	*head)
-{
-	xlog_recover_item_t	*item;
-
-	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
-	INIT_LIST_HEAD(&item->ri_list);
-	list_add_tail(&item->ri_list, head);
-}
-
-STATIC int
-xlog_recover_add_to_cont_trans(
-	struct xlog		*log,
-	struct xlog_recover	*trans,
-	xfs_caddr_t		dp,
-	int			len)
-{
-	xlog_recover_item_t	*item;
-	xfs_caddr_t		ptr, old_ptr;
-	int			old_len;
-
-	if (list_empty(&trans->r_itemq)) {
-		/* finish copying rest of trans header */
-		xlog_recover_add_item(&trans->r_itemq);
-		ptr = (xfs_caddr_t) &trans->r_theader +
-				sizeof(xfs_trans_header_t) - len;
-		memcpy(ptr, dp, len); /* d, s, l */
-		return 0;
-	}
-	/* take the tail entry */
-	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
-
-	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
-	old_len = item->ri_buf[item->ri_cnt-1].i_len;
-
-	ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
-	memcpy(&ptr[old_len], dp, len); /* d, s, l */
-	item->ri_buf[item->ri_cnt-1].i_len += len;
-	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
-	trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
-	return 0;
-}
-
-/*
- * The next region to add is the start of a new region.  It could be
- * a whole region or it could be the first part of a new region.  Because
- * of this, the assumption here is that the type and size fields of all
- * format structures fit into the first 32 bits of the structure.
- *
- * This works because all regions must be 32 bit aligned.  Therefore, we
- * either have both fields or we have neither field.  In the case we have
- * neither field, the data part of the region is zero length.  We only have
- * a log_op_header and can throw away the header since a new one will appear
- * later.  If we have at least 4 bytes, then we can determine how many regions
- * will appear in the current log item.
- */
-STATIC int
-xlog_recover_add_to_trans(
-	struct xlog		*log,
-	struct xlog_recover	*trans,
-	xfs_caddr_t		dp,
-	int			len)
-{
-	xfs_inode_log_format_t	*in_f;			/* any will do */
-	xlog_recover_item_t	*item;
-	xfs_caddr_t		ptr;
-
-	if (!len)
-		return 0;
-	if (list_empty(&trans->r_itemq)) {
-		/* we need to catch log corruptions here */
-		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
-			xfs_warn(log->l_mp, "%s: bad header magic number",
-				__func__);
-			ASSERT(0);
-			return -EIO;
-		}
-		if (len == sizeof(xfs_trans_header_t))
-			xlog_recover_add_item(&trans->r_itemq);
-		memcpy(&trans->r_theader, dp, len); /* d, s, l */
-		return 0;
-	}
-
-	ptr = kmem_alloc(len, KM_SLEEP);
-	memcpy(ptr, dp, len);
-	in_f = (xfs_inode_log_format_t *)ptr;
-
-	/* take the tail entry */
-	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
-	if (item->ri_total != 0 &&
-	     item->ri_total == item->ri_cnt) {
-		/* tail item is in use, get a new one */
-		xlog_recover_add_item(&trans->r_itemq);
-		item = list_entry(trans->r_itemq.prev,
-					xlog_recover_item_t, ri_list);
-	}
-
-	if (item->ri_total == 0) {		/* first region to be added */
-		if (in_f->ilf_size == 0 ||
-		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
-			xfs_warn(log->l_mp,
-		"bad number of regions (%d) in inode log format",
-				  in_f->ilf_size);
-			ASSERT(0);
-			kmem_free(ptr);
-			return -EIO;
-		}
-
-		item->ri_total = in_f->ilf_size;
-		item->ri_buf =
-			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
-				    KM_SLEEP);
-	}
-	ASSERT(item->ri_total > item->ri_cnt);
-	/* Description region is ri_buf[0] */
-	item->ri_buf[item->ri_cnt].i_addr = ptr;
-	item->ri_buf[item->ri_cnt].i_len  = len;
-	item->ri_cnt++;
-	trace_xfs_log_recover_item_add(log, trans, item, 0);
-	return 0;
-}
-
 /*
  * Sort the log items in the transaction.
  *
@@ -3254,31 +3100,6 @@ xlog_recover_do_icreate_pass2(
 	return 0;
 }
 
-/*
- * Free up any resources allocated by the transaction
- *
- * Remember that EFIs, EFDs, and IUNLINKs are handled later.
- */
-STATIC void
-xlog_recover_free_trans(
-	struct xlog_recover	*trans)
-{
-	xlog_recover_item_t	*item, *n;
-	int			i;
-
-	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
-		/* Free the regions in the item. */
-		list_del(&item->ri_list);
-		for (i = 0; i < item->ri_cnt; i++)
-			kmem_free(item->ri_buf[i].i_addr);
-		/* Free the item itself */
-		kmem_free(item->ri_buf);
-		kmem_free(item);
-	}
-	/* Free the transaction recover structure */
-	kmem_free(trans);
-}
-
 STATIC void
 xlog_recover_buffer_ra_pass2(
 	struct xlog                     *log,
@@ -3532,6 +3353,185 @@ out:
 	return error ? error : error2;
 }
 
+
+STATIC xlog_recover_t *
+xlog_recover_find_tid(
+	struct hlist_head	*head,
+	xlog_tid_t		tid)
+{
+	xlog_recover_t		*trans;
+
+	hlist_for_each_entry(trans, head, r_list) {
+		if (trans->r_log_tid == tid)
+			return trans;
+	}
+	return NULL;
+}
+
+STATIC void
+xlog_recover_new_tid(
+	struct hlist_head	*head,
+	xlog_tid_t		tid,
+	xfs_lsn_t		lsn)
+{
+	xlog_recover_t		*trans;
+
+	trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
+	trans->r_log_tid   = tid;
+	trans->r_lsn	   = lsn;
+	INIT_LIST_HEAD(&trans->r_itemq);
+
+	INIT_HLIST_NODE(&trans->r_list);
+	hlist_add_head(&trans->r_list, head);
+}
+
+STATIC void
+xlog_recover_add_item(
+	struct list_head	*head)
+{
+	xlog_recover_item_t	*item;
+
+	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
+	INIT_LIST_HEAD(&item->ri_list);
+	list_add_tail(&item->ri_list, head);
+}
+
+STATIC int
+xlog_recover_add_to_cont_trans(
+	struct xlog		*log,
+	struct xlog_recover	*trans,
+	xfs_caddr_t		dp,
+	int			len)
+{
+	xlog_recover_item_t	*item;
+	xfs_caddr_t		ptr, old_ptr;
+	int			old_len;
+
+	if (list_empty(&trans->r_itemq)) {
+		/* finish copying rest of trans header */
+		xlog_recover_add_item(&trans->r_itemq);
+		ptr = (xfs_caddr_t) &trans->r_theader +
+				sizeof(xfs_trans_header_t) - len;
+		memcpy(ptr, dp, len);
+		return 0;
+	}
+	/* take the tail entry */
+	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+
+	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
+	old_len = item->ri_buf[item->ri_cnt-1].i_len;
+
+	ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
+	memcpy(&ptr[old_len], dp, len);
+	item->ri_buf[item->ri_cnt-1].i_len += len;
+	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+	trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
+	return 0;
+}
+
+/*
+ * The next region to add is the start of a new region.  It could be
+ * a whole region or it could be the first part of a new region.  Because
+ * of this, the assumption here is that the type and size fields of all
+ * format structures fit into the first 32 bits of the structure.
+ *
+ * This works because all regions must be 32 bit aligned.  Therefore, we
+ * either have both fields or we have neither field.  In the case we have
+ * neither field, the data part of the region is zero length.  We only have
+ * a log_op_header and can throw away the header since a new one will appear
+ * later.  If we have at least 4 bytes, then we can determine how many regions
+ * will appear in the current log item.
+ */
+STATIC int
+xlog_recover_add_to_trans(
+	struct xlog		*log,
+	struct xlog_recover	*trans,
+	xfs_caddr_t		dp,
+	int			len)
+{
+	xfs_inode_log_format_t	*in_f;			/* any will do */
+	xlog_recover_item_t	*item;
+	xfs_caddr_t		ptr;
+
+	if (!len)
+		return 0;
+	if (list_empty(&trans->r_itemq)) {
+		/* we need to catch log corruptions here */
+		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
+			xfs_warn(log->l_mp, "%s: bad header magic number",
+				__func__);
+			ASSERT(0);
+			return -EIO;
+		}
+		if (len == sizeof(xfs_trans_header_t))
+			xlog_recover_add_item(&trans->r_itemq);
+		memcpy(&trans->r_theader, dp, len);
+		return 0;
+	}
+
+	ptr = kmem_alloc(len, KM_SLEEP);
+	memcpy(ptr, dp, len);
+	in_f = (xfs_inode_log_format_t *)ptr;
+
+	/* take the tail entry */
+	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+	if (item->ri_total != 0 &&
+	     item->ri_total == item->ri_cnt) {
+		/* tail item is in use, get a new one */
+		xlog_recover_add_item(&trans->r_itemq);
+		item = list_entry(trans->r_itemq.prev,
+					xlog_recover_item_t, ri_list);
+	}
+
+	if (item->ri_total == 0) {		/* first region to be added */
+		if (in_f->ilf_size == 0 ||
+		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
+			xfs_warn(log->l_mp,
+		"bad number of regions (%d) in inode log format",
+				  in_f->ilf_size);
+			ASSERT(0);
+			kmem_free(ptr);
+			return -EIO;
+		}
+
+		item->ri_total = in_f->ilf_size;
+		item->ri_buf =
+			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+				    KM_SLEEP);
+	}
+	ASSERT(item->ri_total > item->ri_cnt);
+	/* Description region is ri_buf[0] */
+	item->ri_buf[item->ri_cnt].i_addr = ptr;
+	item->ri_buf[item->ri_cnt].i_len  = len;
+	item->ri_cnt++;
+	trace_xfs_log_recover_item_add(log, trans, item, 0);
+	return 0;
+}
+/*
+ * Free up any resources allocated by the transaction
+ *
+ * Remember that EFIs, EFDs, and IUNLINKs are handled later.
+ */
+STATIC void
+xlog_recover_free_trans(
+	struct xlog_recover	*trans)
+{
+	xlog_recover_item_t	*item, *n;
+	int			i;
+
+	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
+		/* Free the regions in the item. */
+		list_del(&item->ri_list);
+		for (i = 0; i < item->ri_cnt; i++)
+			kmem_free(item->ri_buf[i].i_addr);
+		/* Free the item itself */
+		kmem_free(item->ri_buf);
+		kmem_free(item);
+	}
+	/* Free the transaction recover structure */
+	kmem_free(trans);
+}
+
 /*
  * On error or completion, trans is freed.
  */
-- 
cgit v0.10.2


From b818cca1976d1a01754033ac08724e05d07cce8f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 29 Sep 2014 09:45:54 +1000
Subject: xfs: refactor recovery transaction start handling

Rework the transaction lookup and allocation code in
xlog_recovery_process_ophdr() to fold two related call-once
helper functions into a single helper. Then fold in all the
XLOG_START_TRANS logic to that helper to clean up the remaining
logic in xlog_recovery_process_ophdr().

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6af9e1a..5019f52 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3353,38 +3353,6 @@ out:
 	return error ? error : error2;
 }
 
-
-STATIC xlog_recover_t *
-xlog_recover_find_tid(
-	struct hlist_head	*head,
-	xlog_tid_t		tid)
-{
-	xlog_recover_t		*trans;
-
-	hlist_for_each_entry(trans, head, r_list) {
-		if (trans->r_log_tid == tid)
-			return trans;
-	}
-	return NULL;
-}
-
-STATIC void
-xlog_recover_new_tid(
-	struct hlist_head	*head,
-	xlog_tid_t		tid,
-	xfs_lsn_t		lsn)
-{
-	xlog_recover_t		*trans;
-
-	trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
-	trans->r_log_tid   = tid;
-	trans->r_lsn	   = lsn;
-	INIT_LIST_HEAD(&trans->r_itemq);
-
-	INIT_HLIST_NODE(&trans->r_list);
-	hlist_add_head(&trans->r_list, head);
-}
-
 STATIC void
 xlog_recover_add_item(
 	struct list_head	*head)
@@ -3507,6 +3475,7 @@ xlog_recover_add_to_trans(
 	trace_xfs_log_recover_item_add(log, trans, item, 0);
 	return 0;
 }
+
 /*
  * Free up any resources allocated by the transaction
  *
@@ -3589,6 +3558,13 @@ xlog_recovery_process_trans(
 	return error;
 }
 
+/*
+ * Lookup the transaction recovery structure associated with the ID in the
+ * current ophdr. If the transaction doesn't exist and the start flag is set in
+ * the ophdr, then allocate a new transaction for future ID matches to find.
+ * Either way, return what we found during the lookup - an existing transaction
+ * or nothing.
+ */
 STATIC struct xlog_recover *
 xlog_recover_ophdr_to_trans(
 	struct hlist_head	rhash[],
@@ -3601,20 +3577,35 @@ xlog_recover_ophdr_to_trans(
 
 	tid = be32_to_cpu(ohead->oh_tid);
 	rhp = &rhash[XLOG_RHASH(tid)];
-	trans = xlog_recover_find_tid(rhp, tid);
-	if (trans)
-		return trans;
+	hlist_for_each_entry(trans, rhp, r_list) {
+		if (trans->r_log_tid == tid)
+			return trans;
+	}
 
 	/*
-	 * If this is a new transaction, the ophdr only contains the
-	 * start record. In that case, the only processing we need to do
-	 * on this opheader is allocate a new recovery container to hold
-	 * the recovery ops that will follow.
+	 * skip over non-start transaction headers - we could be
+	 * processing slack space before the next transaction starts
+	 */
+	if (!(ohead->oh_flags & XLOG_START_TRANS))
+		return NULL;
+
+	ASSERT(be32_to_cpu(ohead->oh_len) == 0);
+
+	/*
+	 * This is a new transaction so allocate a new recovery container to
+	 * hold the recovery ops that will follow.
+	 */
+	trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
+	trans->r_log_tid = tid;
+	trans->r_lsn = be64_to_cpu(rhead->h_lsn);
+	INIT_LIST_HEAD(&trans->r_itemq);
+	INIT_HLIST_NODE(&trans->r_list);
+	hlist_add_head(&trans->r_list, rhp);
+
+	/*
+	 * Nothing more to do for this ophdr. Items to be added to this new
+	 * transaction will be in subsequent ophdr containers.
 	 */
-	if (ohead->oh_flags & XLOG_START_TRANS) {
-		ASSERT(be32_to_cpu(ohead->oh_len) == 0);
-		xlog_recover_new_tid(rhp, tid, be64_to_cpu(rhead->h_lsn));
-	}
 	return NULL;
 }
 
-- 
cgit v0.10.2


From e68ed77521f695d165cbae070f6dda8a4778438f Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 29 Sep 2014 10:43:15 +1000
Subject: xfs: fix use of agi_newino in finobt lookup

Sparse warns that we are passing the big-endian valueo f agi_newino
to the initial btree lookup function when trying to find a new
inode. This is wrong - we need to pass the host order value, not the
disk order value. This will adversely affect the next inode
allocated, but given that the free inode btree is usually much
smaller than the allocated inode btree it is much less likely to be
a performance issue if we start the search in the wrong place.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index d213a2e..23dcb72 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1076,8 +1076,8 @@ xfs_dialloc_ag_finobt_newino(
 	int i;
 
 	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
-		error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ,
-					 &i);
+		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+					 XFS_LOOKUP_EQ, &i);
 		if (error)
 			return error;
 		if (i == 1) {
@@ -1085,7 +1085,6 @@ xfs_dialloc_ag_finobt_newino(
 			if (error)
 				return error;
 			XFS_WANT_CORRUPTED_RETURN(i == 1);
-
 			return 0;
 		}
 	}
-- 
cgit v0.10.2


From bf1ed3833078e3bb0ba8cd03468090b9359d0912 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 29 Sep 2014 10:43:40 +1000
Subject: xfs: xfs_qm_dquot_isolate needs locking annotations for sparse

To remove noise from the build.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 1023210..d68f230 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -434,6 +434,7 @@ xfs_qm_dquot_isolate(
 	struct list_head	*item,
 	spinlock_t		*lru_lock,
 	void			*arg)
+		__releases(lru_lock) __acquires(lru_lock)
 {
 	struct xfs_dquot	*dqp = container_of(item,
 						struct xfs_dquot, q_lru);
-- 
cgit v0.10.2


From e3aed1a08190c038c4ea41b73ea6f07bc0e3290c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 29 Sep 2014 10:46:08 +1000
Subject: xfs: xfs_kset should be static

As it is accessed through the struct xfs_mount and can be set up
entirely from fs/xfs/xfs_super.c

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index fbf0384..d36bdbc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -61,8 +61,6 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
 static uuid_t *xfs_uuid_table;
 
-extern struct kset *xfs_kset;
-
 /*
  * See if the UUID is unique among mounted XFS filesystems.
  * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
@@ -729,7 +727,6 @@ xfs_mountfs(
 
 	xfs_set_maxicount(mp);
 
-	mp->m_kobj.kobject.kset = xfs_kset;
 	error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
 	if (error)
 		goto out;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index dcd4b93..9f622fe 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -63,7 +63,7 @@ static const struct super_operations xfs_super_operations;
 static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
 
-struct kset *xfs_kset;			/* top-level xfs sysfs dir */
+static struct kset *xfs_kset;		/* top-level xfs sysfs dir */
 #ifdef DEBUG
 static struct xfs_kobj xfs_dbg_kobj;	/* global debug sysfs attrs */
 #endif
@@ -1411,6 +1411,7 @@ xfs_fs_fill_super(
 	atomic_set(&mp->m_active_trans, 0);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 	INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
+	mp->m_kobj.kobject.kset = xfs_kset;
 
 	mp->m_super = sb;
 	sb->s_fs_info = mp;
-- 
cgit v0.10.2


From b972d0797180d8414351d9dc8ff65071c692d058 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 29 Sep 2014 10:46:22 +1000
Subject: xfs: annotate user variables passed as void

Some argument callbacks can contain user buffers, and sparse warns
about passing them as void pointers. Cast appropriately to remove
the sparse warnings.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 3799695..7a6b406 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1349,7 +1349,7 @@ xfs_ioc_setxflags(
 STATIC int
 xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
 {
-	struct getbmap __user	*base = *ap;
+	struct getbmap __user	*base = (struct getbmap __user *)*ap;
 
 	/* copy only getbmap portion (not getbmapx) */
 	if (copy_to_user(base, bmv, sizeof(struct getbmap)))
@@ -1380,7 +1380,7 @@ xfs_ioc_getbmap(
 		bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
 
 	error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
-			    (struct getbmap *)arg+1);
+			    (__force struct getbmap *)arg+1);
 	if (error)
 		return error;
 
@@ -1393,7 +1393,7 @@ xfs_ioc_getbmap(
 STATIC int
 xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
 {
-	struct getbmapx __user	*base = *ap;
+	struct getbmapx __user	*base = (struct getbmapx __user *)*ap;
 
 	if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
 		return -EFAULT;
@@ -1420,7 +1420,7 @@ xfs_ioc_getbmapx(
 		return -EINVAL;
 
 	error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
-			    (struct getbmapx *)arg+1);
+			    (__force struct getbmapx *)arg+1);
 	if (error)
 		return error;
 
-- 
cgit v0.10.2


From a870fe6dfaba1cc67424cde4cfd2cd3eee62bf35 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:02:28 +1000
Subject: xfs: force the log before shutting down

When we have marked the filesystem for shutdown, we want to prevent
any further buffer IO from being submitted. However, we currently
force the log after marking the filesystem as shut down, hence
allowing IO to the log *after* we have marked both the filesystem
and the log as in an error state.

Clean this up by forcing the log before we mark the filesytem with
an error. This replaces the pure CIL flush that we currently have
which works around this same issue (i.e the CIL can't be flushed
once the shutdown flags are set) and hence enables us to clean up
the logic substantially.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ca4fd5b..85f36f2 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3867,18 +3867,17 @@ xlog_state_ioerror(
  * This is called from xfs_force_shutdown, when we're forcibly
  * shutting down the filesystem, typically because of an IO error.
  * Our main objectives here are to make sure that:
- *	a. the filesystem gets marked 'SHUTDOWN' for all interested
+ *	a. if !logerror, flush the logs to disk. Anything modified
+ *	   after this is ignored.
+ *	b. the filesystem gets marked 'SHUTDOWN' for all interested
  *	   parties to find out, 'atomically'.
- *	b. those who're sleeping on log reservations, pinned objects and
+ *	c. those who're sleeping on log reservations, pinned objects and
  *	    other resources get woken up, and be told the bad news.
- *	c. nothing new gets queued up after (a) and (b) are done.
- *	d. if !logerror, flush the iclogs to disk, then seal them off
- *	   for business.
+ *	d. nothing new gets queued up after (b) and (c) are done.
  *
- * Note: for delayed logging the !logerror case needs to flush the regions
- * held in memory out to the iclogs before flushing them to disk. This needs
- * to be done before the log is marked as shutdown, otherwise the flush to the
- * iclogs will fail.
+ * Note: for the !logerror case we need to flush the regions held in memory out
+ * to disk first. This needs to be done before the log is marked as shutdown,
+ * otherwise the iclog writes will fail.
  */
 int
 xfs_log_force_umount(
@@ -3910,16 +3909,16 @@ xfs_log_force_umount(
 		ASSERT(XLOG_FORCED_SHUTDOWN(log));
 		return 1;
 	}
-	retval = 0;
 
 	/*
-	 * Flush the in memory commit item list before marking the log as
-	 * being shut down. We need to do it in this order to ensure all the
-	 * completed transactions are flushed to disk with the xfs_log_force()
-	 * call below.
+	 * Flush all the completed transactions to disk before marking the log
+	 * being shut down. We need to do it in this order to ensure that
+	 * completed operations are safely on disk before we shut down, and that
+	 * we don't have to issue any buffer IO after the shutdown flags are set
+	 * to guarantee this.
 	 */
 	if (!logerror)
-		xlog_cil_force(log);
+		_xfs_log_force(mp, XFS_LOG_SYNC, NULL);
 
 	/*
 	 * mark the filesystem and the as in a shutdown state and wake
@@ -3931,18 +3930,11 @@ xfs_log_force_umount(
 		XFS_BUF_DONE(mp->m_sb_bp);
 
 	/*
-	 * This flag is sort of redundant because of the mount flag, but
-	 * it's good to maintain the separation between the log and the rest
-	 * of XFS.
+	 * Mark the log and the iclogs with IO error flags to prevent any
+	 * further log IO from being issued or completed.
 	 */
 	log->l_flags |= XLOG_IO_ERROR;
-
-	/*
-	 * If we hit a log error, we want to mark all the iclogs IOERROR
-	 * while we're still holding the loglock.
-	 */
-	if (logerror)
-		retval = xlog_state_ioerror(log);
+	retval = xlog_state_ioerror(log);
 	spin_unlock(&log->l_icloglock);
 
 	/*
@@ -3955,19 +3947,6 @@ xfs_log_force_umount(
 	xlog_grant_head_wake_all(&log->l_reserve_head);
 	xlog_grant_head_wake_all(&log->l_write_head);
 
-	if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
-		ASSERT(!logerror);
-		/*
-		 * Force the incore logs to disk before shutting the
-		 * log down completely.
-		 */
-		_xfs_log_force(mp, XFS_LOG_SYNC, NULL);
-
-		spin_lock(&log->l_icloglock);
-		retval = xlog_state_ioerror(log);
-		spin_unlock(&log->l_icloglock);
-	}
-
 	/*
 	 * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
 	 * as if the log writes were completed. The abort handling in the log
-- 
cgit v0.10.2


From cf53e99d192171a58791136d33fd3fea5d8bab35 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:04:01 +1000
Subject: xfs: Don't use xfs_buf_iowait in the delwri buffer code

For the special case of delwri buffer submission and waiting, we
don't need to issue IO synchronously at all. The second pass to call
xfs_buf_iowait() can be replaced with  blocking on xfs_buf_lock() -
the buffer will be unlocked when the async IO is complete.

This formalises a sane the method of waiting for async IO - take an
extra reference, submit the IO, call xfs_buf_lock() when you want to
wait for IO completion. i.e.:

	bp = xfs_buf_find();
	xfs_buf_hold(bp);
	bp->b_flags |= XBF_ASYNC;
	xfs_buf_iosubmit(bp);
	xfs_buf_lock(bp)
	error = bp->b_error;
	....
	xfs_buf_relse(bp);

While this is somewhat racy for gathering IO errors, none of the
code that calls xfs_buf_delwri_submit() will race against other
users of the buffers being submitted. Even if they do, we don't
really care if the error is detected by the delwri code or the user
we raced against. Either way, the error will be detected and
handled.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index cd7b8ca..9dc4c22 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1813,12 +1813,17 @@ __xfs_buf_delwri_submit(
 	blk_start_plug(&plug);
 	list_for_each_entry_safe(bp, n, io_list, b_list) {
 		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
-		bp->b_flags |= XBF_WRITE;
+		bp->b_flags |= XBF_WRITE | XBF_ASYNC;
 
-		if (!wait) {
-			bp->b_flags |= XBF_ASYNC;
+		/*
+		 * we do all Io submission async. This means if we need to wait
+		 * for IO completion we need to take an extra reference so the
+		 * buffer is still valid on the other side.
+		 */
+		if (wait)
+			xfs_buf_hold(bp);
+		else
 			list_del_init(&bp->b_list);
-		}
 		xfs_bdstrat_cb(bp);
 	}
 	blk_finish_plug(&plug);
@@ -1866,7 +1871,10 @@ xfs_buf_delwri_submit(
 		bp = list_first_entry(&io_list, struct xfs_buf, b_list);
 
 		list_del_init(&bp->b_list);
-		error2 = xfs_buf_iowait(bp);
+
+		/* locking the buffer will wait for async IO completion. */
+		xfs_buf_lock(bp);
+		error2 = bp->b_error;
 		xfs_buf_relse(bp);
 		if (!error)
 			error = error2;
-- 
cgit v0.10.2


From e11bb8052c3f500e66142f33579cc054d691a8fb Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:04:11 +1000
Subject: xfs: synchronous buffer IO needs a reference

When synchronous IO runs IO completion work, it does so without an
IO reference or a hold reference on the buffer. The IO "hold
reference" is owned by the submitter, and released when the
submission is complete. The IO reference is released when both the
submitter and the bio end_io processing is run, and so if the io
completion work is run from IO completion context, it is run without
an IO reference.

Hence we can get the situation where the submitter can submit the
IO, see an error on the buffer and unlock and free the buffer while
there is still IO in progress. This leads to use-after-free and
memory corruption.

Fix this by taking a "sync IO hold" reference that is owned by the
IO and not released until after the buffer completion calls are run
to wake up synchronous waiters. This means that the buffer will not
be freed in any circumstance until all IO processing is completed.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9dc4c22..48b1e29 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1019,6 +1019,9 @@ xfs_buf_iodone_work(
 	else {
 		ASSERT(read && bp->b_ops);
 		complete(&bp->b_iowait);
+
+		/* release the !XBF_ASYNC ref now we are done. */
+		xfs_buf_rele(bp);
 	}
 }
 
@@ -1044,6 +1047,7 @@ xfs_buf_ioend(
 	} else {
 		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 		complete(&bp->b_iowait);
+		xfs_buf_rele(bp);
 	}
 }
 
@@ -1086,8 +1090,11 @@ xfs_bioerror(
 	xfs_buf_ioerror(bp, -EIO);
 
 	/*
-	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
+	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag. For
+	 * sync IO, xfs_buf_ioend is going to remove a ref here.
 	 */
+	if (!(bp->b_flags & XBF_ASYNC))
+		xfs_buf_hold(bp);
 	XFS_BUF_UNREAD(bp);
 	XFS_BUF_UNDONE(bp);
 	xfs_buf_stale(bp);
@@ -1383,22 +1390,48 @@ xfs_buf_iorequest(
 
 	if (bp->b_flags & XBF_WRITE)
 		xfs_buf_wait_unpin(bp);
+
+	/*
+	 * Take references to the buffer. For XBF_ASYNC buffers, holding a
+	 * reference for as long as submission takes is all that is necessary
+	 * here. The IO inherits the lock and hold count from the submitter,
+	 * and these are release during IO completion processing. Taking a hold
+	 * over submission ensures that the buffer is not freed until we have
+	 * completed all processing, regardless of when IO errors occur or are
+	 * reported.
+	 *
+	 * However, for synchronous IO, the IO does not inherit the submitters
+	 * reference count, nor the buffer lock. Hence we need to take an extra
+	 * reference to the buffer for the for the IO context so that we can
+	 * guarantee the buffer is not freed until all IO completion processing
+	 * is done. Otherwise the caller can drop their reference while the IO
+	 * is still in progress and hence trigger a use-after-free situation.
+	 */
 	xfs_buf_hold(bp);
+	if (!(bp->b_flags & XBF_ASYNC))
+		xfs_buf_hold(bp);
+
 
 	/*
-	 * Set the count to 1 initially, this will stop an I/O
-	 * completion callout which happens before we have started
-	 * all the I/O from calling xfs_buf_ioend too early.
+	 * Set the count to 1 initially, this will stop an I/O completion
+	 * callout which happens before we have started all the I/O from calling
+	 * xfs_buf_ioend too early.
 	 */
 	atomic_set(&bp->b_io_remaining, 1);
 	_xfs_buf_ioapply(bp);
+
 	/*
-	 * If _xfs_buf_ioapply failed, we'll get back here with
-	 * only the reference we took above.  _xfs_buf_ioend will
-	 * drop it to zero, so we'd better not queue it for later,
-	 * or we'll free it before it's done.
+	 * If _xfs_buf_ioapply failed or we are doing synchronous IO that
+	 * completes extremely quickly, we can get back here with only the IO
+	 * reference we took above. _xfs_buf_ioend will drop it to zero. Run
+	 * completion processing synchronously so that we don't return to the
+	 * caller with completion still pending. This avoids unnecessary context
+	 * switches associated with the end_io workqueue.
 	 */
-	_xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
+	if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
+		_xfs_buf_ioend(bp, 0);
+	else
+		_xfs_buf_ioend(bp, 1);
 
 	xfs_buf_rele(bp);
 }
-- 
cgit v0.10.2


From e8aaba9a783c8e5d2c58ebe69650ea31b91bb745 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:04:22 +1000
Subject: xfs: xfs_buf_ioend and xfs_buf_iodone_work duplicate functionality

We do some work in xfs_buf_ioend, and some work in
xfs_buf_iodone_work, but much of that functionality is the same.
This work can all be done in a single function, leaving
xfs_buf_iodone just a wrapper to determine if we should execute it
by workqueue or directly. hence rename xfs_buf_iodone_work to
xfs_buf_ioend(), and add a new xfs_buf_ioend_async() for places that
need async processing.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 48b1e29..a046149 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -998,26 +998,30 @@ xfs_buf_wait_unpin(
  *	Buffer Utility Routines
  */
 
-STATIC void
-xfs_buf_iodone_work(
-	struct work_struct	*work)
+void
+xfs_buf_ioend(
+	struct xfs_buf	*bp)
 {
-	struct xfs_buf		*bp =
-		container_of(work, xfs_buf_t, b_iodone_work);
-	bool			read = !!(bp->b_flags & XBF_READ);
+	bool		read = bp->b_flags & XBF_READ;
+
+	trace_xfs_buf_iodone(bp, _RET_IP_);
 
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 
-	/* only validate buffers that were read without errors */
-	if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
+	/* Only validate buffers that were read without errors */
+	if (read && !bp->b_error && bp->b_ops) {
+		ASSERT(!bp->b_iodone);
 		bp->b_ops->verify_read(bp);
+	}
+
+	if (!bp->b_error)
+		bp->b_flags |= XBF_DONE;
 
 	if (bp->b_iodone)
 		(*(bp->b_iodone))(bp);
 	else if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
 	else {
-		ASSERT(read && bp->b_ops);
 		complete(&bp->b_iowait);
 
 		/* release the !XBF_ASYNC ref now we are done. */
@@ -1025,30 +1029,22 @@ xfs_buf_iodone_work(
 	}
 }
 
-void
-xfs_buf_ioend(
-	struct xfs_buf	*bp,
-	int		schedule)
+static void
+xfs_buf_ioend_work(
+	struct work_struct	*work)
 {
-	bool		read = !!(bp->b_flags & XBF_READ);
-
-	trace_xfs_buf_iodone(bp, _RET_IP_);
+	struct xfs_buf		*bp =
+		container_of(work, xfs_buf_t, b_iodone_work);
 
-	if (bp->b_error == 0)
-		bp->b_flags |= XBF_DONE;
+	xfs_buf_ioend(bp);
+}
 
-	if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
-		if (schedule) {
-			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
-			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
-		} else {
-			xfs_buf_iodone_work(&bp->b_iodone_work);
-		}
-	} else {
-		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
-		complete(&bp->b_iowait);
-		xfs_buf_rele(bp);
-	}
+void
+xfs_buf_ioend_async(
+	struct xfs_buf	*bp)
+{
+	INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work);
+	queue_work(xfslogd_workqueue, &bp->b_iodone_work);
 }
 
 void
@@ -1099,7 +1095,7 @@ xfs_bioerror(
 	XFS_BUF_UNDONE(bp);
 	xfs_buf_stale(bp);
 
-	xfs_buf_ioend(bp, 0);
+	xfs_buf_ioend(bp);
 
 	return -EIO;
 }
@@ -1186,15 +1182,6 @@ xfs_bwrite(
 }
 
 STATIC void
-_xfs_buf_ioend(
-	xfs_buf_t		*bp,
-	int			schedule)
-{
-	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
-		xfs_buf_ioend(bp, schedule);
-}
-
-STATIC void
 xfs_buf_bio_end_io(
 	struct bio		*bio,
 	int			error)
@@ -1211,7 +1198,8 @@ xfs_buf_bio_end_io(
 	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
 
-	_xfs_buf_ioend(bp, 1);
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+		xfs_buf_ioend_async(bp);
 	bio_put(bio);
 }
 
@@ -1423,15 +1411,17 @@ xfs_buf_iorequest(
 	/*
 	 * If _xfs_buf_ioapply failed or we are doing synchronous IO that
 	 * completes extremely quickly, we can get back here with only the IO
-	 * reference we took above. _xfs_buf_ioend will drop it to zero. Run
-	 * completion processing synchronously so that we don't return to the
-	 * caller with completion still pending. This avoids unnecessary context
-	 * switches associated with the end_io workqueue.
+	 * reference we took above. If we drop it to zero, run completion
+	 * processing synchronously so that we don't return to the caller with
+	 * completion still pending. This avoids unnecessary context switches
+	 * associated with the end_io workqueue.
 	 */
-	if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
-		_xfs_buf_ioend(bp, 0);
-	else
-		_xfs_buf_ioend(bp, 1);
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+		if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
+			xfs_buf_ioend(bp);
+		else
+			xfs_buf_ioend_async(bp);
+	}
 
 	xfs_buf_rele(bp);
 }
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c753183..4585c15 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -286,7 +286,7 @@ extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
 extern int xfs_bwrite(struct xfs_buf *bp);
-extern void xfs_buf_ioend(xfs_buf_t *,	int);
+extern void xfs_buf_ioend(struct xfs_buf *bp);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
 extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
 extern void xfs_buf_iorequest(xfs_buf_t *);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 76007de..4fd41b5 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -491,7 +491,7 @@ xfs_buf_item_unpin(
 		xfs_buf_ioerror(bp, -EIO);
 		XFS_BUF_UNDONE(bp);
 		xfs_buf_stale(bp);
-		xfs_buf_ioend(bp, 0);
+		xfs_buf_ioend(bp);
 	}
 }
 
@@ -1115,7 +1115,7 @@ do_callbacks:
 	xfs_buf_do_callbacks(bp);
 	bp->b_fspriv = NULL;
 	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
+	xfs_buf_ioend(bp);
 }
 
 /*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index fea3c92..00d210b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3056,7 +3056,7 @@ cluster_corrupt_out:
 			XFS_BUF_UNDONE(bp);
 			xfs_buf_stale(bp);
 			xfs_buf_ioerror(bp, -EIO);
-			xfs_buf_ioend(bp, 0);
+			xfs_buf_ioend(bp);
 		} else {
 			xfs_buf_stale(bp);
 			xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 85f36f2..3567396 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1678,7 +1678,7 @@ xlog_bdstrat(
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		xfs_buf_ioerror(bp, -EIO);
 		xfs_buf_stale(bp);
-		xfs_buf_ioend(bp, 0);
+		xfs_buf_ioend(bp);
 		/*
 		 * It would seem logical to return EIO here, but we rely on
 		 * the log state machine to propagate I/O errors instead of
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1fd5787..4ba19bf 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -383,7 +383,7 @@ xlog_recover_iodone(
 					SHUTDOWN_META_IO_ERROR);
 	}
 	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
+	xfs_buf_ioend(bp);
 }
 
 /*
-- 
cgit v0.10.2


From 61be9c529a4a715ab8679e9ca82bc3790c7ab66c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:04:31 +1000
Subject: xfs: rework xfs_buf_bio_endio error handling

Currently the report of a bio error from completion
immediately marks the buffer with an error. The issue is that this
is racy w.r.t. synchronous IO - the submitter can see b_error being
set before the IO is complete, and hence we cannot differentiate
between submission failures and completion failures.

Add an internal b_io_error field protected by the b_lock to catch IO
completion errors, and only propagate that to the buffer during
final IO completion handling. Hence we can tell in xfs_buf_iorequest
if we've had a submission failure bey checking bp->b_error before
dropping our b_io_remaining reference - that reference will prevent
b_io_error values from being propagated to b_error in the event that
completion races with submission.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index a046149..170d6c0 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1008,6 +1008,13 @@ xfs_buf_ioend(
 
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 
+	/*
+	 * Pull in IO completion errors now. We are guaranteed to be running
+	 * single threaded, so we don't need the lock to read b_io_error.
+	 */
+	if (!bp->b_error && bp->b_io_error)
+		xfs_buf_ioerror(bp, bp->b_io_error);
+
 	/* Only validate buffers that were read without errors */
 	if (read && !bp->b_error && bp->b_ops) {
 		ASSERT(!bp->b_iodone);
@@ -1192,8 +1199,12 @@ xfs_buf_bio_end_io(
 	 * don't overwrite existing errors - otherwise we can lose errors on
 	 * buffers that require multiple bios to complete.
 	 */
-	if (!bp->b_error)
-		xfs_buf_ioerror(bp, error);
+	if (error) {
+		spin_lock(&bp->b_lock);
+		if (!bp->b_io_error)
+			bp->b_io_error = error;
+		spin_unlock(&bp->b_lock);
+	}
 
 	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
@@ -1379,6 +1390,9 @@ xfs_buf_iorequest(
 	if (bp->b_flags & XBF_WRITE)
 		xfs_buf_wait_unpin(bp);
 
+	/* clear the internal error state to avoid spurious errors */
+	bp->b_io_error = 0;
+
 	/*
 	 * Take references to the buffer. For XBF_ASYNC buffers, holding a
 	 * reference for as long as submission takes is all that is necessary
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 4585c15..44db8cd 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -158,6 +158,7 @@ typedef struct xfs_buf {
 	struct list_head	b_lru;		/* lru list */
 	spinlock_t		b_lock;		/* internal state lock */
 	unsigned int		b_state;	/* internal state flags */
+	int			b_io_error;	/* internal IO error state */
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	struct xfs_perag	*b_pag;		/* contains rbtree root */
-- 
cgit v0.10.2


From 8dac39219827113f14e97507646a610ca426b69e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:04:40 +1000
Subject: xfs: kill xfs_bdstrat_cb

Only has two callers, and is just a shutdown check and error handler
around xfs_buf_iorequest. However, the error handling is a mess of
read and write semantics, and both internal callers only call it for
writes. Hence kill the wrapper, and follow up with a patch to
sanitise the error handling.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 170d6c0..0eee0f1 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1146,27 +1146,6 @@ xfs_bioerror_relse(
 	return -EIO;
 }
 
-STATIC int
-xfs_bdstrat_cb(
-	struct xfs_buf	*bp)
-{
-	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		/*
-		 * Metadata write that didn't get logged but
-		 * written delayed anyway. These aren't associated
-		 * with a transaction, and can be ignored.
-		 */
-		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
-			return xfs_bioerror_relse(bp);
-		else
-			return xfs_bioerror(bp);
-	}
-
-	xfs_buf_iorequest(bp);
-	return 0;
-}
-
 int
 xfs_bwrite(
 	struct xfs_buf		*bp)
@@ -1178,7 +1157,20 @@ xfs_bwrite(
 	bp->b_flags |= XBF_WRITE;
 	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
 
-	xfs_bdstrat_cb(bp);
+	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+
+		/*
+		 * Metadata write that didn't get logged but written anyway.
+		 * These aren't associated with a transaction, and can be
+		 * ignored.
+		 */
+		if (!bp->b_iodone)
+			return xfs_bioerror_relse(bp);
+		return xfs_bioerror(bp);
+	}
+
+	xfs_buf_iorequest(bp);
 
 	error = xfs_buf_iowait(bp);
 	if (error) {
@@ -1861,7 +1853,17 @@ __xfs_buf_delwri_submit(
 			xfs_buf_hold(bp);
 		else
 			list_del_init(&bp->b_list);
-		xfs_bdstrat_cb(bp);
+
+		if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+			trace_xfs_bdstrat_shut(bp, _RET_IP_);
+
+			if (!bp->b_iodone)
+				xfs_bioerror_relse(bp);
+			else
+				xfs_bioerror(bp);
+			continue;
+		}
+		xfs_buf_iorequest(bp);
 	}
 	blk_finish_plug(&plug);
 
-- 
cgit v0.10.2


From 2718775469a521c8b35442db5d665ac0c8c3c8ac Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:04:56 +1000
Subject: xfs: xfs_bioerror can die.

Internal buffer write error handling is a mess due to the unnatural
split between xfs_bioerror and xfs_bioerror_relse().

xfs_bwrite() only does sync IO and determines the handler to
call based on b_iodone, so for this caller the only difference
between xfs_bioerror() and xfs_bioerror_release() is the XBF_DONE
flag. We don't care what the XBF_DONE flag state is because we stale
the buffer in both paths - the next buffer lookup will clear
XBF_DONE because XBF_STALE is set. Hence we can use common
error handling for xfs_bwrite().

__xfs_buf_delwri_submit() is a similar - it's only ever called
on writes - all sync or async - and again there's no reason to
handle them any differently at all.

Clean up the nasty error handling and remove xfs_bioerror().

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 0eee0f1..409a8a0 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1075,39 +1075,6 @@ xfs_buf_ioerror_alert(
 }
 
 /*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
- * so that the proper iodone callbacks get called.
- */
-STATIC int
-xfs_bioerror(
-	xfs_buf_t *bp)
-{
-#ifdef XFSERRORDEBUG
-	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
-	/*
-	 * No need to wait until the buffer is unpinned, we aren't flushing it.
-	 */
-	xfs_buf_ioerror(bp, -EIO);
-
-	/*
-	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag. For
-	 * sync IO, xfs_buf_ioend is going to remove a ref here.
-	 */
-	if (!(bp->b_flags & XBF_ASYNC))
-		xfs_buf_hold(bp);
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDONE(bp);
-	xfs_buf_stale(bp);
-
-	xfs_buf_ioend(bp);
-
-	return -EIO;
-}
-
-/*
  * Same as xfs_bioerror, except that we are releasing the buffer
  * here ourselves, and avoiding the xfs_buf_ioend call.
  * This is meant for userdata errors; metadata bufs come with
@@ -1155,19 +1122,19 @@ xfs_bwrite(
 	ASSERT(xfs_buf_islocked(bp));
 
 	bp->b_flags |= XBF_WRITE;
-	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
+	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
+			 XBF_WRITE_FAIL | XBF_DONE);
 
 	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
 		trace_xfs_bdstrat_shut(bp, _RET_IP_);
 
-		/*
-		 * Metadata write that didn't get logged but written anyway.
-		 * These aren't associated with a transaction, and can be
-		 * ignored.
-		 */
-		if (!bp->b_iodone)
-			return xfs_bioerror_relse(bp);
-		return xfs_bioerror(bp);
+		xfs_buf_ioerror(bp, -EIO);
+		xfs_buf_stale(bp);
+
+		/* sync IO, xfs_buf_ioend is going to remove a ref here */
+		xfs_buf_hold(bp);
+		xfs_buf_ioend(bp);
+		return -EIO;
 	}
 
 	xfs_buf_iorequest(bp);
@@ -1857,10 +1824,9 @@ __xfs_buf_delwri_submit(
 		if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
 			trace_xfs_bdstrat_shut(bp, _RET_IP_);
 
-			if (!bp->b_iodone)
-				xfs_bioerror_relse(bp);
-			else
-				xfs_bioerror(bp);
+			xfs_buf_ioerror(bp, -EIO);
+			xfs_buf_stale(bp);
+			xfs_buf_ioend(bp);
 			continue;
 		}
 		xfs_buf_iorequest(bp);
-- 
cgit v0.10.2


From 8b131973d1628f1a0c5a36fe02269d696bbe60a3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:05:05 +1000
Subject: xfs: kill xfs_bioerror_relse

There is only one caller now - xfs_trans_read_buf_map() - and it has
very well defined call semantics - read, synchronous, and b_iodone
is NULL. Hence it's pretty clear what error handling is necessary
for this case. The bigger problem of untangling
xfs_trans_read_buf_map error handling is left to a future patch.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 409a8a0..108eba7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1074,45 +1074,6 @@ xfs_buf_ioerror_alert(
 		(__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
 }
 
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the xfs_buf_ioend call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
-	struct xfs_buf	*bp)
-{
-	int64_t		fl = bp->b_flags;
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 *
-	 * chunkhold expects B_DONE to be set, whether
-	 * we actually finish the I/O or not. We don't want to
-	 * change that interface.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_DONE(bp);
-	xfs_buf_stale(bp);
-	bp->b_iodone = NULL;
-	if (!(fl & XBF_ASYNC)) {
-		/*
-		 * Mark b_error and B_ERROR _both_.
-		 * Lot's of chunkcache code assumes that.
-		 * There's no reason to mark error for
-		 * ASYNC buffers.
-		 */
-		xfs_buf_ioerror(bp, -EIO);
-		complete(&bp->b_iowait);
-	} else {
-		xfs_buf_relse(bp);
-	}
-
-	return -EIO;
-}
-
 int
 xfs_bwrite(
 	struct xfs_buf		*bp)
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 44db8cd..d8f57f6 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -297,8 +297,6 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 #define xfs_buf_zero(bp, off, len) \
 	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
-extern int xfs_bioerror_relse(struct xfs_buf *);
-
 /* Buffer Utility Routines */
 extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
 
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 96c898e..db4be5b 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -324,11 +324,14 @@ xfs_trans_read_buf_map(
 			 */
 			if (XFS_FORCED_SHUTDOWN(mp)) {
 				trace_xfs_bdstrat_shut(bp, _RET_IP_);
-				xfs_bioerror_relse(bp);
-			} else {
-				xfs_buf_iorequest(bp);
+				bp->b_flags &= ~(XBF_READ | XBF_DONE);
+				xfs_buf_ioerror(bp, -EIO);
+				xfs_buf_stale(bp);
+				xfs_buf_relse(bp);
+				return -EIO;
 			}
 
+			xfs_buf_iorequest(bp);
 			error = xfs_buf_iowait(bp);
 			if (error) {
 				xfs_buf_ioerror_alert(bp, __func__);
-- 
cgit v0.10.2


From 595bff75dce51e0d6d94877b4b6d11b4747a63fd Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:05:14 +1000
Subject: xfs: introduce xfs_buf_submit[_wait]

There is a lot of cookie-cutter code that looks like:

	if (shutdown)
		handle buffer error
	xfs_buf_iorequest(bp)
	error = xfs_buf_iowait(bp)
	if (error)
		handle buffer error

spread through XFS. There's significant complexity now in
xfs_buf_iorequest() to specifically handle this sort of synchronous
IO pattern, but there's all sorts of nasty surprises in different
error handling code dependent on who owns the buffer references and
the locks.

Pull this pattern into a single helper, where we can hide all the
synchronous IO warts and hence make the error handling for all the
callers much saner. This removes the need for a special extra
reference to protect IO completion processing, as we can now hold a
single reference across dispatch and waiting, simplifying the sync
IO smeantics and error handling.

In doing this, also rename xfs_buf_iorequest to xfs_buf_submit and
make it explicitly handle on asynchronous IO. This forces all users
to be switched specifically to one interface or the other and
removes any ambiguity between how the interfaces are to be used. It
also means that xfs_buf_iowait() goes away.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 2f1e30d..c53cc03 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1157,12 +1157,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			error = -EIO;
-			break;
-		}
-		xfs_buf_iorequest(bp);
-		error = xfs_buf_iowait(bp);
+		error = xfs_buf_submit_wait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp,
 					"xfs_zero_remaining_bytes(read)");
@@ -1175,12 +1170,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
 
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			error = -EIO;
-			break;
-		}
-		xfs_buf_iorequest(bp);
-		error = xfs_buf_iowait(bp);
+		error = xfs_buf_submit_wait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp,
 					"xfs_zero_remaining_bytes(write)");
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 108eba7..d99ec83 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -623,10 +623,11 @@ _xfs_buf_read(
 	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
 	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
 
-	xfs_buf_iorequest(bp);
-	if (flags & XBF_ASYNC)
+	if (flags & XBF_ASYNC) {
+		xfs_buf_submit(bp);
 		return 0;
-	return xfs_buf_iowait(bp);
+	}
+	return xfs_buf_submit_wait(bp);
 }
 
 xfs_buf_t *
@@ -708,12 +709,7 @@ xfs_buf_read_uncached(
 	bp->b_flags |= XBF_READ;
 	bp->b_ops = ops;
 
-	if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
-		xfs_buf_relse(bp);
-		return NULL;
-	}
-	xfs_buf_iorequest(bp);
-	xfs_buf_iowait(bp);
+	xfs_buf_submit_wait(bp);
 	return bp;
 }
 
@@ -1028,12 +1024,8 @@ xfs_buf_ioend(
 		(*(bp->b_iodone))(bp);
 	else if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
-	else {
+	else
 		complete(&bp->b_iowait);
-
-		/* release the !XBF_ASYNC ref now we are done. */
-		xfs_buf_rele(bp);
-	}
 }
 
 static void
@@ -1086,21 +1078,7 @@ xfs_bwrite(
 	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
 			 XBF_WRITE_FAIL | XBF_DONE);
 
-	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-
-		xfs_buf_ioerror(bp, -EIO);
-		xfs_buf_stale(bp);
-
-		/* sync IO, xfs_buf_ioend is going to remove a ref here */
-		xfs_buf_hold(bp);
-		xfs_buf_ioend(bp);
-		return -EIO;
-	}
-
-	xfs_buf_iorequest(bp);
-
-	error = xfs_buf_iowait(bp);
+	error = xfs_buf_submit_wait(bp);
 	if (error) {
 		xfs_force_shutdown(bp->b_target->bt_mount,
 				   SHUTDOWN_META_IO_ERROR);
@@ -1209,7 +1187,7 @@ next_chunk:
 	} else {
 		/*
 		 * This is guaranteed not to be the last io reference count
-		 * because the caller (xfs_buf_iorequest) holds a count itself.
+		 * because the caller (xfs_buf_submit) holds a count itself.
 		 */
 		atomic_dec(&bp->b_io_remaining);
 		xfs_buf_ioerror(bp, -EIO);
@@ -1299,13 +1277,29 @@ _xfs_buf_ioapply(
 	blk_finish_plug(&plug);
 }
 
+/*
+ * Asynchronous IO submission path. This transfers the buffer lock ownership and
+ * the current reference to the IO. It is not safe to reference the buffer after
+ * a call to this function unless the caller holds an additional reference
+ * itself.
+ */
 void
-xfs_buf_iorequest(
-	xfs_buf_t		*bp)
+xfs_buf_submit(
+	struct xfs_buf	*bp)
 {
-	trace_xfs_buf_iorequest(bp, _RET_IP_);
+	trace_xfs_buf_submit(bp, _RET_IP_);
 
 	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+	ASSERT(bp->b_flags & XBF_ASYNC);
+
+	/* on shutdown we stale and complete the buffer immediately */
+	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+		xfs_buf_ioerror(bp, -EIO);
+		bp->b_flags &= ~XBF_DONE;
+		xfs_buf_stale(bp);
+		xfs_buf_ioend(bp);
+		return;
+	}
 
 	if (bp->b_flags & XBF_WRITE)
 		xfs_buf_wait_unpin(bp);
@@ -1314,25 +1308,14 @@ xfs_buf_iorequest(
 	bp->b_io_error = 0;
 
 	/*
-	 * Take references to the buffer. For XBF_ASYNC buffers, holding a
-	 * reference for as long as submission takes is all that is necessary
-	 * here. The IO inherits the lock and hold count from the submitter,
-	 * and these are release during IO completion processing. Taking a hold
-	 * over submission ensures that the buffer is not freed until we have
-	 * completed all processing, regardless of when IO errors occur or are
-	 * reported.
-	 *
-	 * However, for synchronous IO, the IO does not inherit the submitters
-	 * reference count, nor the buffer lock. Hence we need to take an extra
-	 * reference to the buffer for the for the IO context so that we can
-	 * guarantee the buffer is not freed until all IO completion processing
-	 * is done. Otherwise the caller can drop their reference while the IO
-	 * is still in progress and hence trigger a use-after-free situation.
+	 * The caller's reference is released during I/O completion.
+	 * This occurs some time after the last b_io_remaining reference is
+	 * released, so after we drop our Io reference we have to have some
+	 * other reference to ensure the buffer doesn't go away from underneath
+	 * us. Take a direct reference to ensure we have safe access to the
+	 * buffer until we are finished with it.
 	 */
 	xfs_buf_hold(bp);
-	if (!(bp->b_flags & XBF_ASYNC))
-		xfs_buf_hold(bp);
-
 
 	/*
 	 * Set the count to 1 initially, this will stop an I/O completion
@@ -1343,40 +1326,82 @@ xfs_buf_iorequest(
 	_xfs_buf_ioapply(bp);
 
 	/*
-	 * If _xfs_buf_ioapply failed or we are doing synchronous IO that
-	 * completes extremely quickly, we can get back here with only the IO
-	 * reference we took above. If we drop it to zero, run completion
-	 * processing synchronously so that we don't return to the caller with
-	 * completion still pending. This avoids unnecessary context switches
-	 * associated with the end_io workqueue.
+	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
+	 * reference we took above. If we drop it to zero, run completion so
+	 * that we don't return to the caller with completion still pending.
 	 */
 	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
-		if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
+		if (bp->b_error)
 			xfs_buf_ioend(bp);
 		else
 			xfs_buf_ioend_async(bp);
 	}
 
 	xfs_buf_rele(bp);
+	/* Note: it is not safe to reference bp now we've dropped our ref */
 }
 
 /*
- * Waits for I/O to complete on the buffer supplied.  It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer, in which
- * case nothing will ever complete.  It returns the I/O error code, if any, or
- * 0 if there was no error.
+ * Synchronous buffer IO submission path, read or write.
  */
 int
-xfs_buf_iowait(
-	xfs_buf_t		*bp)
+xfs_buf_submit_wait(
+	struct xfs_buf	*bp)
 {
-	trace_xfs_buf_iowait(bp, _RET_IP_);
+	int		error;
 
-	if (!bp->b_error)
-		wait_for_completion(&bp->b_iowait);
+	trace_xfs_buf_submit_wait(bp, _RET_IP_);
+
+	ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));
 
+	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+		xfs_buf_ioerror(bp, -EIO);
+		xfs_buf_stale(bp);
+		bp->b_flags &= ~XBF_DONE;
+		return -EIO;
+	}
+
+	if (bp->b_flags & XBF_WRITE)
+		xfs_buf_wait_unpin(bp);
+
+	/* clear the internal error state to avoid spurious errors */
+	bp->b_io_error = 0;
+
+	/*
+	 * For synchronous IO, the IO does not inherit the submitters reference
+	 * count, nor the buffer lock. Hence we cannot release the reference we
+	 * are about to take until we've waited for all IO completion to occur,
+	 * including any xfs_buf_ioend_async() work that may be pending.
+	 */
+	xfs_buf_hold(bp);
+
+	/*
+	 * Set the count to 1 initially, this will stop an I/O completion
+	 * callout which happens before we have started all the I/O from calling
+	 * xfs_buf_ioend too early.
+	 */
+	atomic_set(&bp->b_io_remaining, 1);
+	_xfs_buf_ioapply(bp);
+
+	/*
+	 * make sure we run completion synchronously if it raced with us and is
+	 * already complete.
+	 */
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+		xfs_buf_ioend(bp);
+
+	/* wait for completion before gathering the error from the buffer */
+	trace_xfs_buf_iowait(bp, _RET_IP_);
+	wait_for_completion(&bp->b_iowait);
 	trace_xfs_buf_iowait_done(bp, _RET_IP_);
-	return bp->b_error;
+	error = bp->b_error;
+
+	/*
+	 * all done now, we can release the hold that keeps the buffer
+	 * referenced for the entire IO.
+	 */
+	xfs_buf_rele(bp);
+	return error;
 }
 
 xfs_caddr_t
@@ -1782,15 +1807,7 @@ __xfs_buf_delwri_submit(
 		else
 			list_del_init(&bp->b_list);
 
-		if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
-			trace_xfs_bdstrat_shut(bp, _RET_IP_);
-
-			xfs_buf_ioerror(bp, -EIO);
-			xfs_buf_stale(bp);
-			xfs_buf_ioend(bp);
-			continue;
-		}
-		xfs_buf_iorequest(bp);
+		xfs_buf_submit(bp);
 	}
 	blk_finish_plug(&plug);
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index d8f57f6..0acfc30 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -290,8 +290,8 @@ extern int xfs_bwrite(struct xfs_buf *bp);
 extern void xfs_buf_ioend(struct xfs_buf *bp);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
 extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
-extern void xfs_buf_iorequest(xfs_buf_t *);
-extern int xfs_buf_iowait(xfs_buf_t *);
+extern void xfs_buf_submit(struct xfs_buf *bp);
+extern int xfs_buf_submit_wait(struct xfs_buf *bp);
 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 				xfs_buf_rw_t);
 #define xfs_buf_zero(bp, off, len) \
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 4fd41b5..cbea909 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1081,7 +1081,7 @@ xfs_buf_iodone_callbacks(
 	 * a way to shut the filesystem down if the writes keep failing.
 	 *
 	 * In practice we'll shut the filesystem down soon as non-transient
-	 * erorrs tend to affect the whole device and a failing log write
+	 * errors tend to affect the whole device and a failing log write
 	 * will make us give up.  But we really ought to do better here.
 	 */
 	if (XFS_BUF_ISASYNC(bp)) {
@@ -1094,7 +1094,7 @@ xfs_buf_iodone_callbacks(
 		if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
 			bp->b_flags |= XBF_WRITE | XBF_ASYNC |
 				       XBF_DONE | XBF_WRITE_FAIL;
-			xfs_buf_iorequest(bp);
+			xfs_buf_submit(bp);
 		} else {
 			xfs_buf_relse(bp);
 		}
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3567396..fe88ef6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1688,7 +1688,7 @@ xlog_bdstrat(
 		return 0;
 	}
 
-	xfs_buf_iorequest(bp);
+	xfs_buf_submit(bp);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 4ba19bf..980e296 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -193,12 +193,8 @@ xlog_bread_noalign(
 	bp->b_io_length = nbblks;
 	bp->b_error = 0;
 
-	if (XFS_FORCED_SHUTDOWN(log->l_mp))
-		return -EIO;
-
-	xfs_buf_iorequest(bp);
-	error = xfs_buf_iowait(bp);
-	if (error)
+	error = xfs_buf_submit_wait(bp);
+	if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
 		xfs_buf_ioerror_alert(bp, __func__);
 	return error;
 }
@@ -378,9 +374,11 @@ xlog_recover_iodone(
 		 * We're not going to bother about retrying
 		 * this during recovery. One strike!
 		 */
-		xfs_buf_ioerror_alert(bp, __func__);
-		xfs_force_shutdown(bp->b_target->bt_mount,
-					SHUTDOWN_META_IO_ERROR);
+		if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+			xfs_buf_ioerror_alert(bp, __func__);
+			xfs_force_shutdown(bp->b_target->bt_mount,
+						SHUTDOWN_META_IO_ERROR);
+		}
 	}
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp);
@@ -4427,16 +4425,12 @@ xlog_do_recover(
 	XFS_BUF_UNASYNC(bp);
 	bp->b_ops = &xfs_sb_buf_ops;
 
-	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
-		xfs_buf_relse(bp);
-		return -EIO;
-	}
-
-	xfs_buf_iorequest(bp);
-	error = xfs_buf_iowait(bp);
+	error = xfs_buf_submit_wait(bp);
 	if (error) {
-		xfs_buf_ioerror_alert(bp, __func__);
-		ASSERT(0);
+		if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
+			xfs_buf_ioerror_alert(bp, __func__);
+			ASSERT(0);
+		}
 		xfs_buf_relse(bp);
 		return error;
 	}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 152f827..51372e3 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -349,7 +349,8 @@ DEFINE_BUF_EVENT(xfs_buf_free);
 DEFINE_BUF_EVENT(xfs_buf_hold);
 DEFINE_BUF_EVENT(xfs_buf_rele);
 DEFINE_BUF_EVENT(xfs_buf_iodone);
-DEFINE_BUF_EVENT(xfs_buf_iorequest);
+DEFINE_BUF_EVENT(xfs_buf_submit);
+DEFINE_BUF_EVENT(xfs_buf_submit_wait);
 DEFINE_BUF_EVENT(xfs_buf_bawrite);
 DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index db4be5b..e2b2216 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -318,23 +318,10 @@ xfs_trans_read_buf_map(
 			XFS_BUF_READ(bp);
 			bp->b_ops = ops;
 
-			/*
-			 * XXX(hch): clean up the error handling here to be less
-			 * of a mess..
-			 */
-			if (XFS_FORCED_SHUTDOWN(mp)) {
-				trace_xfs_bdstrat_shut(bp, _RET_IP_);
-				bp->b_flags &= ~(XBF_READ | XBF_DONE);
-				xfs_buf_ioerror(bp, -EIO);
-				xfs_buf_stale(bp);
-				xfs_buf_relse(bp);
-				return -EIO;
-			}
-
-			xfs_buf_iorequest(bp);
-			error = xfs_buf_iowait(bp);
+			error = xfs_buf_submit_wait(bp);
 			if (error) {
-				xfs_buf_ioerror_alert(bp, __func__);
+				if (!XFS_FORCED_SHUTDOWN(mp))
+					xfs_buf_ioerror_alert(bp, __func__);
 				xfs_buf_relse(bp);
 				/*
 				 * We can gracefully recover from most read
-- 
cgit v0.10.2


From ba3726742c1712c43c5a18245476f3fe9fe74773 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:05:32 +1000
Subject: xfs: check xfs_buf_read_uncached returns correctly

xfs_buf_read_uncached() has two failure modes. If can either return
NULL or bp->b_error != 0 depending on the type of failure, and not
all callers check for both. Fix it so that xfs_buf_read_uncached()
always returns the error status, and the buffer is returned as a
function parameter. The buffer will only be returned on success.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d99ec83..6fbcbbf 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -688,29 +688,39 @@ xfs_buf_readahead_map(
  * Read an uncached buffer from disk. Allocates and returns a locked
  * buffer containing the disk contents or nothing.
  */
-struct xfs_buf *
+int
 xfs_buf_read_uncached(
 	struct xfs_buftarg	*target,
 	xfs_daddr_t		daddr,
 	size_t			numblks,
 	int			flags,
+	struct xfs_buf		**bpp,
 	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf		*bp;
 
+	*bpp = NULL;
+
 	bp = xfs_buf_get_uncached(target, numblks, flags);
 	if (!bp)
-		return NULL;
+		return -ENOMEM;
 
 	/* set up the buffer for a read IO */
 	ASSERT(bp->b_map_count == 1);
-	bp->b_bn = daddr;
+	bp->b_bn = XFS_BUF_DADDR_NULL;  /* always null for uncached buffers */
 	bp->b_maps[0].bm_bn = daddr;
 	bp->b_flags |= XBF_READ;
 	bp->b_ops = ops;
 
 	xfs_buf_submit_wait(bp);
-	return bp;
+	if (bp->b_error) {
+		int	error = bp->b_error;
+		xfs_buf_relse(bp);
+		return error;
+	}
+
+	*bpp = bp;
+	return 0;
 }
 
 /*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 0acfc30..82002c0 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -269,9 +269,9 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
 
 struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
 				int flags);
-struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
-				xfs_daddr_t daddr, size_t numblks, int flags,
-				const struct xfs_buf_ops *ops);
+int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
+			  size_t numblks, int flags, struct xfs_buf **bpp,
+			  const struct xfs_buf_ops *ops);
 void xfs_buf_hold(struct xfs_buf *bp);
 
 /* Releasing Buffers */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f91de1e..c05ac8b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -172,16 +172,11 @@ xfs_growfs_data_private(
 	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
 		return error;
 	dpct = pct - mp->m_sb.sb_imax_pct;
-	bp = xfs_buf_read_uncached(mp->m_ddev_targp,
+	error = xfs_buf_read_uncached(mp->m_ddev_targp,
 				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-				XFS_FSS_TO_BB(mp, 1), 0, NULL);
-	if (!bp)
-		return -EIO;
-	if (bp->b_error) {
-		error = bp->b_error;
-		xfs_buf_relse(bp);
+				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error)
 		return error;
-	}
 	xfs_buf_relse(bp);
 
 	new = nb;	/* use new as a temporary here */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index fbf0384..142c460 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -302,21 +302,15 @@ xfs_readsb(
 	 * access to the superblock.
 	 */
 reread:
-	bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
-				   BTOBB(sector_size), 0, buf_ops);
-	if (!bp) {
-		if (loud)
-			xfs_warn(mp, "SB buffer read failed");
-		return -EIO;
-	}
-	if (bp->b_error) {
-		error = bp->b_error;
+	error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
+				   BTOBB(sector_size), 0, &bp, buf_ops);
+	if (error) {
 		if (loud)
 			xfs_warn(mp, "SB validate failed with error %d.", error);
 		/* bad CRC means corrupted metadata */
 		if (error == -EFSBADCRC)
 			error = -EFSCORRUPTED;
-		goto release_buf;
+		return error;
 	}
 
 	/*
@@ -546,40 +540,43 @@ xfs_set_inoalignment(xfs_mount_t *mp)
  * Check that the data (and log if separate) is an ok size.
  */
 STATIC int
-xfs_check_sizes(xfs_mount_t *mp)
+xfs_check_sizes(
+	struct xfs_mount *mp)
 {
-	xfs_buf_t	*bp;
+	struct xfs_buf	*bp;
 	xfs_daddr_t	d;
+	int		error;
 
 	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
 	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
 		xfs_warn(mp, "filesystem size mismatch detected");
 		return -EFBIG;
 	}
-	bp = xfs_buf_read_uncached(mp->m_ddev_targp,
+	error = xfs_buf_read_uncached(mp->m_ddev_targp,
 					d - XFS_FSS_TO_BB(mp, 1),
-					XFS_FSS_TO_BB(mp, 1), 0, NULL);
-	if (!bp) {
+					XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error) {
 		xfs_warn(mp, "last sector read failed");
-		return -EIO;
+		return error;
 	}
 	xfs_buf_relse(bp);
 
-	if (mp->m_logdev_targp != mp->m_ddev_targp) {
-		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
-		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
-			xfs_warn(mp, "log size mismatch detected");
-			return -EFBIG;
-		}
-		bp = xfs_buf_read_uncached(mp->m_logdev_targp,
+	if (mp->m_logdev_targp == mp->m_ddev_targp)
+		return 0;
+
+	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
+		xfs_warn(mp, "log size mismatch detected");
+		return -EFBIG;
+	}
+	error = xfs_buf_read_uncached(mp->m_logdev_targp,
 					d - XFS_FSB_TO_BB(mp, 1),
-					XFS_FSB_TO_BB(mp, 1), 0, NULL);
-		if (!bp) {
-			xfs_warn(mp, "log device read failed");
-			return -EIO;
-		}
-		xfs_buf_relse(bp);
+					XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error) {
+		xfs_warn(mp, "log device read failed");
+		return error;
 	}
+	xfs_buf_relse(bp);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 909e143..1ad0093 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -972,16 +972,11 @@ xfs_growfs_rt(
 	/*
 	 * Read in the last block of the device, make sure it exists.
 	 */
-	bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
+	error = xfs_buf_read_uncached(mp->m_rtdev_targp,
 				XFS_FSB_TO_BB(mp, nrblocks - 1),
-				XFS_FSB_TO_BB(mp, 1), 0, NULL);
-	if (!bp)
-		return -EIO;
-	if (bp->b_error) {
-		error = bp->b_error;
-		xfs_buf_relse(bp);
+				XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error)
 		return error;
-	}
 	xfs_buf_relse(bp);
 
 	/*
@@ -1235,11 +1230,12 @@ xfs_rtallocate_extent(
  */
 int				/* error */
 xfs_rtmount_init(
-	xfs_mount_t	*mp)	/* file system mount structure */
+	struct xfs_mount	*mp)	/* file system mount structure */
 {
-	xfs_buf_t	*bp;	/* buffer for last block of subvolume */
-	xfs_daddr_t	d;	/* address of last block of subvolume */
-	xfs_sb_t	*sbp;	/* filesystem superblock copy in mount */
+	struct xfs_buf		*bp;	/* buffer for last block of subvolume */
+	struct xfs_sb		*sbp;	/* filesystem superblock copy in mount */
+	xfs_daddr_t		d;	/* address of last block of subvolume */
+	int			error;
 
 	sbp = &mp->m_sb;
 	if (sbp->sb_rblocks == 0)
@@ -1265,14 +1261,12 @@ xfs_rtmount_init(
 			(unsigned long long) mp->m_sb.sb_rblocks);
 		return -EFBIG;
 	}
-	bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
+	error = xfs_buf_read_uncached(mp->m_rtdev_targp,
 					d - XFS_FSB_TO_BB(mp, 1),
-					XFS_FSB_TO_BB(mp, 1), 0, NULL);
-	if (!bp || bp->b_error) {
+					XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error) {
 		xfs_warn(mp, "realtime device size check failed");
-		if (bp)
-			xfs_buf_relse(bp);
-		return -EIO;
+		return error;
 	}
 	xfs_buf_relse(bp);
 	return 0;
-- 
cgit v0.10.2


From 8c15612546bce1ecafb7dee3cce8a2a9b560e15e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 2 Oct 2014 09:05:44 +1000
Subject: xfs: simplify xfs_zero_remaining_bytes

xfs_zero_remaining_bytes() open codes a log of buffer manupulations
to do a read forllowed by a write. It can simply be replaced by an
uncached read followed by a xfs_bwrite() call.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index c53cc03..aa70620 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1122,14 +1122,6 @@ xfs_zero_remaining_bytes(
 	if (endoff > XFS_ISIZE(ip))
 		endoff = XFS_ISIZE(ip);
 
-	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
-					mp->m_rtdev_targp : mp->m_ddev_targp,
-				  BTOBB(mp->m_sb.sb_blocksize), 0);
-	if (!bp)
-		return -ENOMEM;
-
-	xfs_buf_unlock(bp);
-
 	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
 		uint lock_mode;
 
@@ -1152,32 +1144,24 @@ xfs_zero_remaining_bytes(
 		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
 		if (imap.br_state == XFS_EXT_UNWRITTEN)
 			continue;
-		XFS_BUF_UNDONE(bp);
-		XFS_BUF_UNWRITE(bp);
-		XFS_BUF_READ(bp);
-		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 
-		error = xfs_buf_submit_wait(bp);
-		if (error) {
-			xfs_buf_ioerror_alert(bp,
-					"xfs_zero_remaining_bytes(read)");
-			break;
-		}
+		error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
+				mp->m_rtdev_targp : mp->m_ddev_targp,
+				xfs_fsb_to_db(ip, imap.br_startblock),
+				BTOBB(mp->m_sb.sb_blocksize),
+				0, &bp, NULL);
+		if (error)
+			return error;
+
 		memset(bp->b_addr +
-			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
-		      0, lastoffset - offset + 1);
-		XFS_BUF_UNDONE(bp);
-		XFS_BUF_UNREAD(bp);
-		XFS_BUF_WRITE(bp);
+				(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
+		       0, lastoffset - offset + 1);
 
-		error = xfs_buf_submit_wait(bp);
-		if (error) {
-			xfs_buf_ioerror_alert(bp,
-					"xfs_zero_remaining_bytes(write)");
-			break;
-		}
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			return error;
 	}
-	xfs_buf_free(bp);
 	return error;
 }
 
-- 
cgit v0.10.2


From b1d6cc02f2f6a590c4d8dc2c3bcf7be3b9419945 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:17:58 +1000
Subject: xfs: compat_xfs_bstat does not have forkoff

struct compat_xfs_bstat is missing the di_forkoff field and so does
not fully translate the structure correctly. Fix it.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index a554646..94ce027 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -160,6 +160,7 @@ xfs_ioctl32_bstat_copyin(
 	    get_user(bstat->bs_gen,	&bstat32->bs_gen)	||
 	    get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
 	    get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
+	    get_user(bstat->bs_forkoff,	&bstat32->bs_forkoff)	||
 	    get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask)	||
 	    get_user(bstat->bs_dmstate,	&bstat32->bs_dmstate)	||
 	    get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -214,6 +215,7 @@ xfs_bulkstat_one_fmt_compat(
 	    put_user(buffer->bs_gen,	  &p32->bs_gen)		||
 	    put_user(buffer->bs_projid,	  &p32->bs_projid)	||
 	    put_user(buffer->bs_projid_hi,	&p32->bs_projid_hi)	||
+	    put_user(buffer->bs_forkoff,  &p32->bs_forkoff)	||
 	    put_user(buffer->bs_dmevmask, &p32->bs_dmevmask)	||
 	    put_user(buffer->bs_dmstate,  &p32->bs_dmstate)	||
 	    put_user(buffer->bs_aextents, &p32->bs_aextents))
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 80f4060..b1bb454 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -67,8 +67,9 @@ typedef struct compat_xfs_bstat {
 	__u32		bs_gen;		/* generation count		*/
 	__u16		bs_projid_lo;	/* lower part of project id	*/
 #define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/
+	__u16		bs_forkoff;	/* inode fork offset in bytes	*/
 	__u16		bs_projid_hi;	/* high part of project id	*/
-	unsigned char	bs_pad[12];	/* pad space, unused		*/
+	unsigned char	bs_pad[10];	/* pad space, unused		*/
 	__u32		bs_dmevmask;	/* DMIG event mask		*/
 	__u16		bs_dmstate;	/* DMIG state info		*/
 	__u16		bs_aextents;	/* attribute number of extents	*/
-- 
cgit v0.10.2


From e076b0f3a5c472e77c0a0e163188f2761e8b4fed Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:18:13 +1000
Subject: xfs: kill time.h

The typedef for timespecs and nanotime() are completely unnecessary,
and delay() can be moved to fs/xfs/linux.h, which means this file
can go away.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 844e288..53e95b2 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -21,7 +21,6 @@
 #include <linux/swap.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
-#include "time.h"
 #include "kmem.h"
 #include "xfs_message.h"
 
diff --git a/fs/xfs/time.h b/fs/xfs/time.h
deleted file mode 100644
index 387e695..0000000
--- a/fs/xfs/time.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_TIME_H__
-#define __XFS_SUPPORT_TIME_H__
-
-#include <linux/sched.h>
-#include <linux/time.h>
-
-typedef struct timespec timespec_t;
-
-static inline void delay(long ticks)
-{
-	schedule_timeout_uninterruptible(ticks);
-}
-
-static inline void nanotime(struct timespec *tvp)
-{
-	*tvp = CURRENT_TIME;
-}
-
-#endif /* __XFS_SUPPORT_TIME_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c92cb48..4c130ff 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -654,7 +654,7 @@ xfs_ialloc(
 	xfs_inode_t	*ip;
 	uint		flags;
 	int		error;
-	timespec_t	tv;
+	struct timespec	tv;
 
 	/*
 	 * Call the space management code to pick
@@ -720,7 +720,7 @@ xfs_ialloc(
 	ip->i_d.di_nextents = 0;
 	ASSERT(ip->i_d.di_nblocks == 0);
 
-	nanotime(&tv);
+	tv = current_fs_time(mp->m_super);
 	ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
 	ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
 	ip->i_d.di_atime = ip->i_d.di_mtime;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index d10dc8f..6a51619 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -56,7 +56,6 @@ typedef __uint64_t __psunsigned_t;
 
 #include "kmem.h"
 #include "mrlock.h"
-#include "time.h"
 #include "uuid.h"
 
 #include <linux/semaphore.h>
@@ -179,6 +178,11 @@ typedef __uint64_t __psunsigned_t;
 #define MAX(a,b)	(max(a,b))
 #define howmany(x, y)	(((x)+((y)-1))/(y))
 
+static inline void delay(long ticks)
+{
+	schedule_timeout_uninterruptible(ticks);
+}
+
 /*
  * XFS wrapper structure for sysfs support. It depends on external data
  * structures and is embedded in various internal data structures to implement
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 50c3f56..cdb4d86 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -70,7 +70,7 @@ xfs_trans_ichgtime(
 	int			flags)
 {
 	struct inode		*inode = VFS_I(ip);
-	timespec_t		tv;
+	struct timespec		tv;
 
 	ASSERT(tp);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-- 
cgit v0.10.2


From 9336e3a765b68d4a7fdd8256f393ebce95ecb0a7 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:18:40 +1000
Subject: xfs: project id inheritance is a directory only flag

xfs_set_diflags() allows it to be set on non-directory inodes, and
this flags errors in xfs_repair. Further, inode allocation allows
the same directory-only flag to be inherited to non-directories.
Make sure directory inode flags don't appear on other types of
inodes.

This fixes several xfstests scratch fileystem corruption reports
(e.g. xfs/050) now that xfstests checks scratch filesystems after
test completion.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4c130ff..2f63742 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -769,6 +769,8 @@ xfs_ialloc(
 					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 					ip->i_d.di_extsize = pip->i_d.di_extsize;
 				}
+				if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+					di_flags |= XFS_DIFLAG_PROJINHERIT;
 			} else if (S_ISREG(mode)) {
 				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
 					di_flags |= XFS_DIFLAG_REALTIME;
@@ -789,8 +791,6 @@ xfs_ialloc(
 			if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
 			    xfs_inherit_nosymlinks)
 				di_flags |= XFS_DIFLAG_NOSYMLINKS;
-			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-				di_flags |= XFS_DIFLAG_PROJINHERIT;
 			if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
 			    xfs_inherit_nodefrag)
 				di_flags |= XFS_DIFLAG_NODEFRAG;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 3799695..05a1955 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -968,8 +968,6 @@ xfs_set_diflags(
 		di_flags |= XFS_DIFLAG_NOATIME;
 	if (xflags & XFS_XFLAG_NODUMP)
 		di_flags |= XFS_DIFLAG_NODUMP;
-	if (xflags & XFS_XFLAG_PROJINHERIT)
-		di_flags |= XFS_DIFLAG_PROJINHERIT;
 	if (xflags & XFS_XFLAG_NODEFRAG)
 		di_flags |= XFS_DIFLAG_NODEFRAG;
 	if (xflags & XFS_XFLAG_FILESTREAM)
@@ -981,6 +979,8 @@ xfs_set_diflags(
 			di_flags |= XFS_DIFLAG_NOSYMLINKS;
 		if (xflags & XFS_XFLAG_EXTSZINHERIT)
 			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+		if (xflags & XFS_XFLAG_PROJINHERIT)
+			di_flags |= XFS_DIFLAG_PROJINHERIT;
 	} else if (S_ISREG(ip->i_d.di_mode)) {
 		if (xflags & XFS_XFLAG_REALTIME)
 			di_flags |= XFS_DIFLAG_REALTIME;
-- 
cgit v0.10.2


From a872703f34cd6033d0b174fa598f63f1a57145bb Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 2 Oct 2014 09:20:30 +1000
Subject: xfs: only set extent size hint when asked

Currently the extent size hint is set unconditionally in
xfs_ioctl_setattr() when the FSX_EXTSIZE flag is set. Hence we can
set hints when the inode flags indicating the hint should be used
are not set.  Hence only set the extent size hint from userspace
when the inode has the XFS_DIFLAG_EXTSIZE flag set to indicate that
we should have an extent size hint set on the inode.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 05a1955..d6afc9f 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1231,13 +1231,25 @@ xfs_ioctl_setattr(
 
 	}
 
-	if (mask & FSX_EXTSIZE)
-		ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
 	if (mask & FSX_XFLAGS) {
 		xfs_set_diflags(ip, fa->fsx_xflags);
 		xfs_diflags_to_linux(ip);
 	}
 
+	/*
+	 * Only set the extent size hint if we've already determined that the
+	 * extent size hint should be set on the inode. If no extent size flags
+	 * are set on the inode then unconditionally clear the extent size hint.
+	 */
+	if (mask & FSX_EXTSIZE) {
+		int	extsize = 0;
+
+		if (ip->i_d.di_flags &
+				(XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
+			extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
+		ip->i_d.di_extsize = extsize;
+	}
+
 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-- 
cgit v0.10.2


From ce57bcf6b81caf1e9f780e98e8d23d3555746d74 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Thu, 2 Oct 2014 09:21:53 +1000
Subject: xfs: check for inode size overflow in xfs_new_eof()

If we write to the maximum file offset (2^63-2), XFS fails to log the
inode size update when the page is flushed. For example:

$ xfs_io -fc "pwrite `echo "2^63-1-1" | bc` 1" /mnt/file
wrote 1/1 bytes at offset 9223372036854775806
1.000000 bytes, 1 ops; 0.0000 sec (22.711 KiB/sec and 23255.8140 ops/sec)
$ stat -c %s /mnt/file
9223372036854775807
$ umount /mnt ; mount <dev> /mnt/
$ stat -c %s /mnt/file
0

This occurs because XFS calculates the new file size as io_offset +
io_size, I/O occurs in block sized requests, and the maximum supported
file size is not block aligned. Therefore, a write to the max allowable
offset on a 4k blocksize fs results in a write of size 4k to offset
2^63-4096 (e.g., equivalent to round_down(2^63-1, 4096), or IOW the
offset of the block that contains the max file size). The offset plus
size calculation (2^63 - 4096 + 4096 == 2^63) overflows the signed
64-bit variable which goes negative and causes the > comparison to the
on-disk inode size to fail. This returns 0 from xfs_new_eof() and
results in no change to the inode on-disk.

Update xfs_new_eof() to explicitly detect overflow of the local
calculation and use the VFS inode size in this scenario. The VFS inode
size is capped to the maximum and thus XFS writes the correct inode size
to disk.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c10e3fa..9af2882 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -102,7 +102,7 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
 {
 	xfs_fsize_t i_size = i_size_read(VFS_I(ip));
 
-	if (new_size > i_size)
+	if (new_size > i_size || new_size < 0)
 		new_size = i_size;
 	return new_size > ip->i_d.di_size ? new_size : 0;
 }
-- 
cgit v0.10.2


From 6ee49a20c13b4b4e79a3bba406df8106cff284a1 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 2 Oct 2014 09:23:49 +1000
Subject: xfs: don't send null bp to xfs_trans_brelse()

In this case, if bp is NULL, error is set, and we send a
NULL bp to xfs_trans_brelse, which will try to dereference it.

Test whether we actually have a buffer before we try to
free it.

Coverity spotted this.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 2c42ae2..fd82753 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2563,7 +2563,8 @@ xfs_da_get_buf(
 				    mapp, nmap, 0);
 	error = bp ? bp->b_error : -EIO;
 	if (error) {
-		xfs_trans_brelse(trans, bp);
+		if (bp)
+			xfs_trans_brelse(trans, bp);
 		goto out_free;
 	}
 
-- 
cgit v0.10.2


From 04dd1a0d4b17a71220eae4fb313218f15a49bcdd Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 2 Oct 2014 09:24:11 +1000
Subject: xfs: fix crc field handling in xfs_sb_to/from_disk

I discovered this in userspace, but the same change applies
to the kernel.

If we xfs_mdrestore an image from a non-crc filesystem, lo
and behold the restored image has gained a CRC:

# db/xfs_metadump.sh -o /dev/sdc1 - | xfs_mdrestore - test.img
# xfs_db -c "sb 0" -c "p crc" /dev/sdc1
crc = 0 (correct)
# xfs_db -c "sb 0" -c "p crc" test.img
crc = 0xb6f8d6a0 (correct)

This is because xfs_sb_from_disk doesn't fill in sb_crc,
but xfs_sb_to_disk(XFS_SB_ALL_BITS) does write the in-memory
CRC to disk - so we get uninitialized memory on disk.

Fix this by always initializing sb_crc to 0 when we read
the superblock, and masking out the CRC bit from ALL_BITS
when we write it.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 8426e5e..5f902fa 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -445,6 +445,8 @@ __xfs_sb_from_disk(
 	to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
 	to->sb_features_log_incompat =
 				be32_to_cpu(from->sb_features_log_incompat);
+	/* crc is only used on disk, not in memory; just init to 0 here. */
+	to->sb_crc = 0;
 	to->sb_pad = 0;
 	to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
 	to->sb_lsn = be64_to_cpu(from->sb_lsn);
@@ -550,6 +552,9 @@ xfs_sb_to_disk(
 	if (!fields)
 		return;
 
+	/* We should never write the crc here, it's updated in the IO path */
+	fields &= ~XFS_SB_CRC;
+
 	xfs_sb_quota_to_disk(to, from, &fields);
 	while (fields) {
 		f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
-- 
cgit v0.10.2


From 5cca3f611d159e5a4a5ec60413bd09948ef40aea Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 2 Oct 2014 09:27:09 +1000
Subject: xfs: check for null dquot in xfs_quota_calc_throttle()

Coverity spotted this.

Granted, we *just* checked xfs_inod_dquot() in the caller (by
calling xfs_quota_need_throttle). However, this is the only place we
don't check the return value but the check is cheap and future-proof
so add it.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index e9c47b6..afcf3c9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -404,8 +404,8 @@ xfs_quota_calc_throttle(
 	int shift = 0;
 	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
 
-	/* over hi wmark, squash the prealloc completely */
-	if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+	/* no dq, or over hi wmark, squash the prealloc completely */
+	if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
 		*qblocks = 0;
 		*qfreesp = 0;
 		return;
-- 
cgit v0.10.2


From 07d08681d26e99d8ba3bc4e56380f2cc04d3ff3b Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Thu, 2 Oct 2014 09:42:06 +1000
Subject: xfs: restore buffer_head unwritten bit on ioend cancel

xfs_vm_writepage() walks each buffer_head on the page, maps to the block
on disk and attaches to a running ioend structure that represents the
I/O submission. A new ioend is created when the type of I/O (unwritten,
delayed allocation or overwrite) required for a particular buffer_head
differs from the previous. If a buffer_head is a delalloc or unwritten
buffer, the associated bits are cleared by xfs_map_at_offset() once the
buffer_head is added to the ioend.

The process of mapping each buffer_head occurs in xfs_map_blocks() and
acquires the ilock in blocking or non-blocking mode, depending on the
type of writeback in progress. If the lock cannot be acquired for
non-blocking writeback, we cancel the ioend, redirty the page and
return. Writeback will revisit the page at some later point.

Note that we acquire the ilock for each buffer on the page. Therefore
during non-blocking writeback, it is possible to add an unwritten buffer
to the ioend, clear the unwritten state, fail to acquire the ilock when
mapping a subsequent buffer and cancel the ioend. If this occurs, the
unwritten status of the buffer sitting in the ioend has been lost. The
page will eventually hit writeback again, but xfs_vm_writepage() submits
overwrite I/O instead of unwritten I/O and does not perform unwritten
extent conversion at I/O completion. This leads to data corruption
because unwritten extents are treated as holes on reads and zeroes are
returned instead of reading from disk.

Modify xfs_cancel_ioend() to restore the buffer unwritten bit for ioends
of type XFS_IO_UNWRITTEN. This ensures that unwritten extent conversion
occurs once the page is eventually written back.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 11e9b4c..dc3e108 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -548,6 +548,13 @@ xfs_cancel_ioend(
 		do {
 			next_bh = bh->b_private;
 			clear_buffer_async_write(bh);
+			/*
+			 * The unwritten flag is cleared when added to the
+			 * ioend. We're not submitting for I/O so mark the
+			 * buffer unwritten again for next time around.
+			 */
+			if (ioend->io_type == XFS_IO_UNWRITTEN)
+				set_buffer_unwritten(bh);
 			unlock_buffer(bh);
 		} while ((bh = next_bh) != NULL);
 
-- 
cgit v0.10.2


From da5f10969d54006a24777a84ed3eaeeb2a21047f Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Thu, 2 Oct 2014 09:44:54 +1000
Subject: xfs: flush the range before zero range conversion

XFS currently discards delalloc blocks within the target range of a
zero range request. Unaligned start and end offsets are zeroed
through the page cache and the internal, aligned blocks are
converted to unwritten extents.

If EOF is page aligned and covered by a delayed allocation extent.
The inode size is not updated until I/O completion. If a zero range
request discards a delalloc range that covers page aligned EOF as
such, the inode size update never occurs. For example:

$ rm -f /mnt/file
$ xfs_io -fc "pwrite 0 64k" -c "zero 60k 4k" /mnt/file
$ stat -c "%s" /mnt/file
65536
$ umount /mnt
$ mount <dev> /mnt
$ stat -c "%s" /mnt/file
61440

Update xfs_zero_file_space() to flush the range rather than discard
delalloc blocks to ensure that inode size updates occur
appropriately.

[dchinner: Note that this is really a workaround to avoid the
underlying problems. More work is needed (and ongoing) to fix those
issues so this fix is being added as a temporary stop-gap measure. ]

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1cb345e..6f5cb63 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1392,14 +1392,14 @@ xfs_zero_file_space(
 
 	if (start_boundary < end_boundary - 1) {
 		/*
-		 * punch out delayed allocation blocks and the page cache over
-		 * the conversion range
+		 * Writeback the range to ensure any inode size updates due to
+		 * appending writes make it to disk (otherwise we could just
+		 * punch out the delalloc blocks).
 		 */
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		error = xfs_bmap_punch_delalloc_range(ip,
-				XFS_B_TO_FSBT(mp, start_boundary),
-				XFS_B_TO_FSB(mp, end_boundary - start_boundary));
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+				start_boundary, end_boundary - 1);
+		if (error)
+			goto out;
 		truncate_pagecache_range(VFS_I(ip), start_boundary,
 					 end_boundary - 1);
 
-- 
cgit v0.10.2


From 52177937e9ac4573391143065b250403d3a6ae4b Mon Sep 17 00:00:00 2001
From: Mark Tinguely <tinguely@sgi.com>
Date: Fri, 3 Oct 2014 09:09:50 +1000
Subject: xfs: xfs_iflush_done checks the wrong log item callback

Commit 3013683 ("xfs: remove all the inodes on a buffer from the AIL
in bulk") made the xfs inode flush callback more efficient by
combining all the inode writes on the buffer and the deletions of
the inode log item from AIL.

The initial loop in this patch should be looping through all
the log items on the buffer to see which items have
xfs_iflush_done as their callback function. But currently,
only the log item passed to the function has its callback
compared to xfs_iflush_done. If the log item pointer passed to
the function does have the xfs_iflush_done callback function,
then all the log items on the buffer are removed from the
li_bio_list on the buffer b_fspriv and could be removed from
the AIL even though they may have not been written yet.

This problem is masked by the fact that currently all inodes on a
buffer will have the same calback function - either xfs_iflush_done
or xfs_istale_done - and hence the bug cannot manifest in any way.
Still, we need to remove the landmine so that if we add new
callbacks in future this doesn't cause us problems.

Signed-off-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index de5a7be..63de0b0 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -615,7 +615,7 @@ xfs_iflush_done(
 	blip = bp->b_fspriv;
 	prev = NULL;
 	while (blip != NULL) {
-		if (lip->li_cb != xfs_iflush_done) {
+		if (blip->li_cb != xfs_iflush_done) {
 			prev = blip;
 			blip = blip->li_bio_list;
 			continue;
-- 
cgit v0.10.2


From a8b1ee8bafc765ebf029d03c5479a69aebff9693 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Mon, 13 Oct 2014 10:21:53 +1100
Subject: xfs: fix agno increment in xfs_inumbers() loop

caused a regression in xfs_inumbers, which in turn broke
xfsdump, causing incomplete dumps.

The loop in xfs_inumbers() needs to fill the user-supplied
buffers, and iterates via xfs_btree_increment, reading new
ags as needed.

But the first time through the loop, if xfs_btree_increment()
succeeds, we continue, which triggers the ++agno at the bottom
of the loop, and we skip to soon to the next ag - without
the proper setup under next_ag to read the next ag.

Fix this by removing the agno increment from the loop conditional,
and only increment agno if we have actually hit the code under
the next_ag: target.

Cc: stable@vger.kernel.org
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f71be9c..f1deb96 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -639,7 +639,8 @@ next_ag:
 		xfs_buf_relse(agbp);
 		agbp = NULL;
 		agino = 0;
-	} while (++agno < mp->m_sb.sb_agcount);
+		agno++;
+	} while (agno < mp->m_sb.sb_agcount);
 
 	if (!error) {
 		if (bufidx) {
-- 
cgit v0.10.2