From e12070a5dca8bfeee352e9655ae79772a96b32f8 Mon Sep 17 00:00:00 2001
From: Niv Sardi <xaiki@sgi.com>
Date: Thu, 6 Mar 2008 13:43:03 +1100
Subject: [XFS] actually check error returned by xfs_flush_pages, clean up and
 bailout if fails.

SGI-PV: 973041
SGI-Modid: xfs-linux-melb:xfs-kern:30462a

Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 2def273..87f6467 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5869,6 +5869,10 @@ xfs_getbmap(
 		/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
 		error = xfs_flush_pages(ip, (xfs_off_t)0,
 					       -1, 0, FI_REMAPF);
+		if (error) {
+			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+			return error;
+		}
 	}
 
 	ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
-- 
cgit v0.10.2


From 461aa8a22595e3bd3e6f4dc2894d7c4315ea2bb9 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Thu, 6 Mar 2008 13:43:11 +1100
Subject: [XFS] make inode reclaim synchronise with xfs_iflush_done()

On a forced shutdown, xfs_finish_reclaim() will skip flushing the inode.
If the inode flush lock is not already held and there is an outstanding
xfs_iflush_done() then we might free the inode prematurely. By acquiring
and releasing the flush lock we will synchronise with xfs_iflush_done().

SGI-PV: 909874
SGI-Modid: xfs-linux-melb:xfs-kern:30468a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 64c5953..ce82a2d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3694,12 +3694,12 @@ xfs_finish_reclaim(
 	 * We get the flush lock regardless, though, just to make sure
 	 * we don't free it while it is being flushed.
 	 */
-	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		if (!locked) {
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_iflock(ip);
-		}
+	if (!locked) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_iflock(ip);
+	}
 
+	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		if (ip->i_update_core ||
 		    ((ip->i_itemp != NULL) &&
 		     (ip->i_itemp->ili_format.ilf_fields != 0))) {
@@ -3719,17 +3719,11 @@ xfs_finish_reclaim(
 		ASSERT(ip->i_update_core == 0);
 		ASSERT(ip->i_itemp == NULL ||
 		       ip->i_itemp->ili_format.ilf_fields == 0);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	} else if (locked) {
-		/*
-		 * We are not interested in doing an iflush if we're
-		 * in the process of shutting down the filesystem forcibly.
-		 * So, just reclaim the inode.
-		 */
-		xfs_ifunlock(ip);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 
+	xfs_ifunlock(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
  reclaim:
 	xfs_ireclaim(ip);
 	return 0;
-- 
cgit v0.10.2


From 163d3686bb09d88e2120bffe780a3f2d7cc4c948 Mon Sep 17 00:00:00 2001
From: Donald Douwsma <donaldd@sgi.com>
Date: Thu, 6 Mar 2008 13:43:20 +1100
Subject: [XFS] Remove the xfs_refcache

Remove the xfs_refcache, it was only needed while we were still
building for 2.4 kernels.

SGI-PV: 971186
SGI-Modid: xfs-linux-melb:xfs-kern:30472a

Signed-off-by: Donald Douwsma <donaldd@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 3ca39c4..e514332 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -99,7 +99,6 @@
 /*
  * Feature macros (disable/enable)
  */
-#undef  HAVE_REFCACHE	/* reference cache not needed for NFS in 2.6 */
 #define HAVE_SPLICE	/* a splice(2) exists in 2.6, but not in 2.4 */
 #ifdef CONFIG_SMP
 #define HAVE_PERCPU_SB	/* per cpu superblock counters are a 2.6 feature */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index bfcd72c..eaa0189 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -240,10 +240,6 @@ typedef struct xfs_inode {
 	atomic_t		i_pincount;	/* inode pin count */
 	wait_queue_head_t	i_ipin_wait;	/* inode pinning wait queue */
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
-#ifdef HAVE_REFCACHE
-	struct xfs_inode	**i_refcache;	/* ptr to entry in ref cache */
-	struct xfs_inode	*i_release;	/* inode to unref */
-#endif
 	/* Miscellaneous state. */
 	unsigned short		i_flags;	/* see defined flags below */
 	unsigned char		i_update_core;	/* timestamps/size is dirty */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 7eb157a..1c6d40e 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -36,7 +36,6 @@
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
-#include "xfs_refcache.h"
 #include "xfs_utils.h"
 #include "xfs_trans_space.h"
 #include "xfs_vnodeops.h"
@@ -580,10 +579,8 @@ xfs_rename(
 	 * the vnode references.
 	 */
 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	if (target_ip != NULL) {
-		xfs_refcache_purge_ip(target_ip);
+	if (target_ip != NULL)
 		IRELE(target_ip);
-	}
 	/*
 	 * Let interposed file systems know about removed links.
 	 */
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 7094caf..79bdfb3 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -43,7 +43,6 @@
 #include "xfs_error.h"
 #include "xfs_bmap.h"
 #include "xfs_rw.h"
-#include "xfs_refcache.h"
 #include "xfs_buf_item.h"
 #include "xfs_log_priv.h"
 #include "xfs_dir2_trace.h"
@@ -157,7 +156,6 @@ xfs_cleanup(void)
 
 	xfs_cleanup_procfs();
 	xfs_sysctl_unregister();
-	xfs_refcache_destroy();
 	xfs_filestream_uninit();
 	xfs_mru_cache_uninit();
 	xfs_acl_zone_destroy(xfs_acl_zone);
@@ -584,11 +582,6 @@ xfs_unmount(
 					0 : DM_FLAGS_UNWANTED;
 	}
 #endif
-	/*
-	 * First blow any referenced inode from this file system
-	 * out of the reference cache, and delete the timer.
-	 */
-	xfs_refcache_purge_mp(mp);
 
 	/*
 	 * Blow away any referenced inode in the filestreams cache.
@@ -652,7 +645,6 @@ xfs_quiesce_fs(
 {
 	int			count = 0, pincount;
 
-	xfs_refcache_purge_mp(mp);
 	xfs_flush_buftarg(mp->m_ddev_targp, 0);
 	xfs_finish_reclaim_all(mp, 0);
 
@@ -1323,18 +1315,6 @@ xfs_syncsub(
 	}
 
 	/*
-	 * If this is the periodic sync, then kick some entries out of
-	 * the reference cache.  This ensures that idle entries are
-	 * eventually kicked out of the cache.
-	 */
-	if (flags & SYNC_REFCACHE) {
-		if (flags & SYNC_WAIT)
-			xfs_refcache_purge_mp(mp);
-		else
-			xfs_refcache_purge_some(mp);
-	}
-
-	/*
 	 * If asked, update the disk superblock with incore counter values if we
 	 * are using non-persistent counters so that they don't get too far out
 	 * of sync if we crash or get a forced shutdown. We don't want to force
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index ce82a2d..35ac59d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -48,7 +48,6 @@
 #include "xfs_quota.h"
 #include "xfs_utils.h"
 #include "xfs_rtalloc.h"
-#include "xfs_refcache.h"
 #include "xfs_trans_space.h"
 #include "xfs_log_priv.h"
 #include "xfs_filestream.h"
@@ -1520,12 +1519,6 @@ xfs_release(
 			xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
 	}
 
-#ifdef HAVE_REFCACHE
-	/* If we are in the NFS reference cache then don't do this now */
-	if (ip->i_refcache)
-		return 0;
-#endif
-
 	if (ip->i_d.di_nlink != 0) {
 		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
 		     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
@@ -2449,14 +2442,6 @@ xfs_remove(
 	}
 
 	/*
-	 * Before we drop our extra reference to the inode, purge it
-	 * from the refcache if it is there.  By waiting until afterwards
-	 * to do the IRELE, we ensure that we won't go inactive in the
-	 * xfs_refcache_purge_ip routine (although that would be OK).
-	 */
-	xfs_refcache_purge_ip(ip);
-
-	/*
 	 * If we are using filestreams, kill the stream association.
 	 * If the file is still open it may get a new one but that
 	 * will get killed on last close in xfs_close() so we don't
@@ -2495,14 +2480,6 @@ xfs_remove(
 	cancel_flags |= XFS_TRANS_ABORT;
 	xfs_trans_cancel(tp, cancel_flags);
 
-	/*
-	 * Before we drop our extra reference to the inode, purge it
-	 * from the refcache if it is there.  By waiting until afterwards
-	 * to do the IRELE, we ensure that we won't go inactive in the
-	 * xfs_refcache_purge_ip routine (although that would be OK).
-	 */
-	xfs_refcache_purge_ip(ip);
-
 	IRELE(ip);
 
 	goto std_return;
@@ -3460,16 +3437,7 @@ xfs_rwunlock(
 {
  	if (S_ISDIR(ip->i_d.di_mode))
   		return;
-	if (locktype == VRWLOCK_WRITE) {
-		/*
-		 * In the write case, we may have added a new entry to
-		 * the reference cache.  This might store a pointer to
-		 * an inode to be released in this inode.  If it is there,
-		 * clear the pointer and release the inode after unlocking
-		 * this one.
-		 */
-		xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
-	} else {
+	if (locktype != VRWLOCK_WRITE) {
 		ASSERT((locktype == VRWLOCK_READ) ||
 		       (locktype == VRWLOCK_WRITE_DIRECT));
 		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-- 
cgit v0.10.2


From e9a56b7cdaf6129892fd7c8d950b71a1a4304bb0 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Thu, 6 Mar 2008 13:43:27 +1100
Subject: [XFS] Fix regression due to refcache removal

SGI-PV: 971186
SGI-Modid: xfs-linux-melb:xfs-kern:30490a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Donald Douwsma <donaldd@sgi.com>

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 35ac59d..40b95e3 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3437,7 +3437,9 @@ xfs_rwunlock(
 {
  	if (S_ISDIR(ip->i_d.di_mode))
   		return;
-	if (locktype != VRWLOCK_WRITE) {
+	if (locktype == VRWLOCK_WRITE) {
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	} else {
 		ASSERT((locktype == VRWLOCK_READ) ||
 		       (locktype == VRWLOCK_WRITE_DIRECT));
 		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-- 
cgit v0.10.2


From 4ae29b4321b99b711bcfde5527c4fbf249eac60f Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:43:34 +1100
Subject: [XFS] Factor xfs_itobp() and xfs_inotobp().

The only difference between the functions is one passes an inode for the
lookup, the other passes an inode number. However, they don't do the same
validity checking or set all the same state on the buffer that is returned
yet they should.

Factor the functions into a common implementation.

SGI-PV: 970925
SGI-Modid: xfs-linux-melb:xfs-kern:30500a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f43a6e0..6f156fa 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -126,6 +126,85 @@ xfs_inobp_check(
 #endif
 
 /*
+ * Find the buffer associated with the given inode map.
+ * We do basic validation checks on the buffer once it has been
+ * retrieved from disk.
+ */
+STATIC int
+xfs_imap_to_bp(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_imap_t	*imap,
+	xfs_buf_t	**bpp,
+	uint		buf_flags,
+	uint		imap_flags)
+{
+	int		error;
+	int		i;
+	int		ni;
+	xfs_buf_t	*bp;
+
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+				   (int)imap->im_len, XFS_BUF_LOCK, &bp);
+	if (error) {
+		cmn_err(CE_WARN, "xfs_imap_to_bp: xfs_trans_read_buf()returned "
+				"an error %d on %s.  Returning error.",
+				error, mp->m_fsname);
+		return error;
+	}
+
+	/*
+	 * Validate the magic number and version of every inode in the buffer
+	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
+	 */
+#ifdef DEBUG
+	ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
+#else	/* usual case */
+	ni = 1;
+#endif
+
+	for (i = 0; i < ni; i++) {
+		int		di_ok;
+		xfs_dinode_t	*dip;
+
+		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+					(i << mp->m_sb.sb_inodelog));
+		di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
+			    XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
+		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+						XFS_ERRTAG_ITOBP_INOTOBP,
+						XFS_RANDOM_ITOBP_INOTOBP))) {
+			if (imap_flags & XFS_IMAP_BULKSTAT) {
+				xfs_trans_brelse(tp, bp);
+				return XFS_ERROR(EINVAL);
+			}
+			XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
+						XFS_ERRLEVEL_HIGH, mp, dip);
+#ifdef DEBUG
+			cmn_err(CE_PANIC,
+					"Device %s - bad inode magic/vsn "
+					"daddr %lld #%d (magic=%x)",
+				XFS_BUFTARG_NAME(mp->m_ddev_targp),
+				(unsigned long long)imap->im_blkno, i,
+				be16_to_cpu(dip->di_core.di_magic));
+#endif
+			xfs_trans_brelse(tp, bp);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+	}
+
+	xfs_inobp_check(mp, bp);
+
+	/*
+	 * Mark the buffer as an inode buffer now that it looks good
+	 */
+	XFS_BUF_SET_VTYPE(bp, B_FS_INO);
+
+	*bpp = bp;
+	return 0;
+}
+
+/*
  * This routine is called to map an inode number within a file
  * system to the buffer containing the on-disk version of the
  * inode.  It returns a pointer to the buffer containing the
@@ -147,72 +226,19 @@ xfs_inotobp(
 	xfs_buf_t	**bpp,
 	int		*offset)
 {
-	int		di_ok;
 	xfs_imap_t	imap;
 	xfs_buf_t	*bp;
 	int		error;
-	xfs_dinode_t	*dip;
 
-	/*
-	 * Call the space management code to find the location of the
-	 * inode on disk.
-	 */
 	imap.im_blkno = 0;
 	error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
-	if (error != 0) {
-		cmn_err(CE_WARN,
-	"xfs_inotobp: xfs_imap()  returned an "
-	"error %d on %s.  Returning error.", error, mp->m_fsname);
+	if (error)
 		return error;
-	}
-
-	/*
-	 * If the inode number maps to a block outside the bounds of the
-	 * file system then return NULL rather than calling read_buf
-	 * and panicing when we get an error from the driver.
-	 */
-	if ((imap.im_blkno + imap.im_len) >
-	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-		cmn_err(CE_WARN,
-	"xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds "
-	"of the file system %s.  Returning EINVAL.",
-			(unsigned long long)imap.im_blkno,
-			imap.im_len, mp->m_fsname);
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
-	 * default to just a read_buf() call.
-	 */
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
-				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
 
-	if (error) {
-		cmn_err(CE_WARN,
-	"xfs_inotobp: xfs_trans_read_buf()  returned an "
-	"error %d on %s.  Returning error.", error, mp->m_fsname);
+	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
+	if (error)
 		return error;
-	}
-	dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
-	di_ok =
-		be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
-		XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
-	if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
-			XFS_RANDOM_ITOBP_INOTOBP))) {
-		XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
-		xfs_trans_brelse(tp, bp);
-		cmn_err(CE_WARN,
-	"xfs_inotobp: XFS_TEST_ERROR()  returned an "
-	"error on %s.  Returning EFSCORRUPTED.",  mp->m_fsname);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
 
-	xfs_inobp_check(mp, bp);
-
-	/*
-	 * Set *dipp to point to the on-disk inode in the buffer.
-	 */
 	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 	*bpp = bp;
 	*offset = imap.im_boffset;
@@ -253,41 +279,15 @@ xfs_itobp(
 	xfs_imap_t	imap;
 	xfs_buf_t	*bp;
 	int		error;
-	int		i;
-	int		ni;
 
 	if (ip->i_blkno == (xfs_daddr_t)0) {
-		/*
-		 * Call the space management code to find the location of the
-		 * inode on disk.
-		 */
 		imap.im_blkno = bno;
-		if ((error = xfs_imap(mp, tp, ip->i_ino, &imap,
-					XFS_IMAP_LOOKUP | imap_flags)))
+		error = xfs_imap(mp, tp, ip->i_ino, &imap,
+					XFS_IMAP_LOOKUP | imap_flags);
+		if (error)
 			return error;
 
 		/*
-		 * If the inode number maps to a block outside the bounds
-		 * of the file system then return NULL rather than calling
-		 * read_buf and panicing when we get an error from the
-		 * driver.
-		 */
-		if ((imap.im_blkno + imap.im_len) >
-		    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-#ifdef DEBUG
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
-					"(imap.im_blkno (0x%llx) "
-					"+ imap.im_len (0x%llx)) > "
-					" XFS_FSB_TO_BB(mp, "
-					"mp->m_sb.sb_dblocks) (0x%llx)",
-					(unsigned long long) imap.im_blkno,
-					(unsigned long long) imap.im_len,
-					XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
-#endif /* DEBUG */
-			return XFS_ERROR(EINVAL);
-		}
-
-		/*
 		 * Fill in the fields in the inode that will be used to
 		 * map the inode to its buffer from now on.
 		 */
@@ -305,76 +305,10 @@ xfs_itobp(
 	}
 	ASSERT(bno == 0 || bno == imap.im_blkno);
 
-	/*
-	 * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
-	 * default to just a read_buf() call.
-	 */
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
-				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
-	if (error) {
-#ifdef DEBUG
-		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
-				"xfs_trans_read_buf() returned error %d, "
-				"imap.im_blkno 0x%llx, imap.im_len 0x%llx",
-				error, (unsigned long long) imap.im_blkno,
-				(unsigned long long) imap.im_len);
-#endif /* DEBUG */
+	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
+	if (error)
 		return error;
-	}
-
-	/*
-	 * Validate the magic number and version of every inode in the buffer
-	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
-	 * No validation is done here in userspace (xfs_repair).
-	 */
-#if !defined(__KERNEL__)
-	ni = 0;
-#elif defined(DEBUG)
-	ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
-#else	/* usual case */
-	ni = 1;
-#endif
-
-	for (i = 0; i < ni; i++) {
-		int		di_ok;
-		xfs_dinode_t	*dip;
 
-		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-					(i << mp->m_sb.sb_inodelog));
-		di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
-			    XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
-		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
-						XFS_ERRTAG_ITOBP_INOTOBP,
-						XFS_RANDOM_ITOBP_INOTOBP))) {
-			if (imap_flags & XFS_IMAP_BULKSTAT) {
-				xfs_trans_brelse(tp, bp);
-				return XFS_ERROR(EINVAL);
-			}
-#ifdef DEBUG
-			cmn_err(CE_ALERT,
-					"Device %s - bad inode magic/vsn "
-					"daddr %lld #%d (magic=%x)",
-				XFS_BUFTARG_NAME(mp->m_ddev_targp),
-				(unsigned long long)imap.im_blkno, i,
-				be16_to_cpu(dip->di_core.di_magic));
-#endif
-			XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
-					     mp, dip);
-			xfs_trans_brelse(tp, bp);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
-	}
-
-	xfs_inobp_check(mp, bp);
-
-	/*
-	 * Mark the buffer as an inode buffer now that it looks good
-	 */
-	XFS_BUF_SET_VTYPE(bp, B_FS_INO);
-
-	/*
-	 * Set *dipp to point to the on-disk inode in the buffer.
-	 */
 	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 	*bpp = bp;
 	return 0;
@@ -2678,14 +2612,31 @@ xfs_imap(
 	fsbno = imap->im_blkno ?
 		XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
 	error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
-	if (error != 0) {
+	if (error)
 		return error;
-	}
+
 	imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
 	imap->im_len = XFS_FSB_TO_BB(mp, len);
 	imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
 	imap->im_ioffset = (ushort)off;
 	imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
+
+	/*
+	 * If the inode number maps to a block outside the bounds
+	 * of the file system then return NULL rather than calling
+	 * read_buf and panicing when we get an error from the
+	 * driver.
+	 */
+	if ((imap->im_blkno + imap->im_len) >
+	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
+			"(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
+			" XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
+			(unsigned long long) imap->im_blkno,
+			(unsigned long long) imap->im_len,
+			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+		return EINVAL;
+	}
 	return 0;
 }
 
-- 
cgit v0.10.2


From a3f74ffb6d1448d9a8f482e593b80ec15f1695d4 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:43:42 +1100
Subject: [XFS] Don't block pdflush when writing back inodes

When pdflush is writing back inodes, it can get stuck on inode cluster
buffers that are currently under I/O. This occurs when we write data to
multiple inodes in the same inode cluster at the same time.

Effectively, delayed allocation marks the inode dirty during the data
writeback. Hence if the inode cluster was flushed during the writeback of
the first inode, the writeback of the second inode will block waiting for
the inode cluster write to complete before writing it again for the newly
dirtied inode.

Basically, we want to avoid this from happening so we don't block pdflush
and slow down all of writeback. Hence we introduce a non-blocking async
inode flush flag that pdflush uses. If this flag is set, we use
non-blocking operations (e.g. try locks) wherever we can to avoid
blocking or extra I/O being issued.

SGI-PV: 970925
SGI-Modid: xfs-linux-melb:xfs-kern:30501a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 8831d95..cb9ce90 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -896,7 +896,8 @@ xfs_fs_write_inode(
 	struct inode		*inode,
 	int			sync)
 {
-	int			error = 0, flags = FLUSH_INODE;
+	int			error = 0;
+	int			flags = 0;
 
 	xfs_itrace_entry(XFS_I(inode));
 	if (sync) {
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index b5ea418..f200e024 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -73,12 +73,9 @@ typedef enum bhv_vrwlock {
 #define IO_INVIS	0x00020		/* don't update inode timestamps */
 
 /*
- * Flags for vop_iflush call
+ * Flags for xfs_inode_flush
  */
 #define FLUSH_SYNC		1	/* wait for flush to complete	*/
-#define FLUSH_INODE		2	/* flush the inode itself	*/
-#define FLUSH_LOG		4	/* force the last log entry for
-					 * this inode out to disk	*/
 
 /*
  * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6f156fa..3c3e9e3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -145,11 +145,16 @@ xfs_imap_to_bp(
 	xfs_buf_t	*bp;
 
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-				   (int)imap->im_len, XFS_BUF_LOCK, &bp);
+				   (int)imap->im_len, buf_flags, &bp);
 	if (error) {
-		cmn_err(CE_WARN, "xfs_imap_to_bp: xfs_trans_read_buf()returned "
+		if (error != EAGAIN) {
+			cmn_err(CE_WARN,
+				"xfs_imap_to_bp: xfs_trans_read_buf()returned "
 				"an error %d on %s.  Returning error.",
 				error, mp->m_fsname);
+		} else {
+			ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+		}
 		return error;
 	}
 
@@ -274,7 +279,8 @@ xfs_itobp(
 	xfs_dinode_t	**dipp,
 	xfs_buf_t	**bpp,
 	xfs_daddr_t	bno,
-	uint		imap_flags)
+	uint		imap_flags,
+	uint		buf_flags)
 {
 	xfs_imap_t	imap;
 	xfs_buf_t	*bp;
@@ -305,10 +311,17 @@ xfs_itobp(
 	}
 	ASSERT(bno == 0 || bno == imap.im_blkno);
 
-	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
+	error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
 	if (error)
 		return error;
 
+	if (!bp) {
+		ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+		ASSERT(tp == NULL);
+		*bpp = NULL;
+		return EAGAIN;
+	}
+
 	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 	*bpp = bp;
 	return 0;
@@ -812,7 +825,7 @@ xfs_iread(
 	 * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
 	 * know that this is a new incore inode.
 	 */
-	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags);
+	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
 	if (error) {
 		kmem_zone_free(xfs_inode_zone, ip);
 		return error;
@@ -1901,7 +1914,7 @@ xfs_iunlink(
 		 * Here we put the head pointer into our next pointer,
 		 * and then we fall through to point the head at us.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
 		if (error)
 			return error;
 
@@ -2009,7 +2022,7 @@ xfs_iunlink_remove(
 		 * of dealing with the buffer when there is no need to
 		 * change it.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
 		if (error) {
 			cmn_err(CE_WARN,
 				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -2071,7 +2084,7 @@ xfs_iunlink_remove(
 		 * Now last_ibp points to the buffer previous to us on
 		 * the unlinked list.  Pull us from the list.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
 		if (error) {
 			cmn_err(CE_WARN,
 				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -2334,7 +2347,7 @@ xfs_ifree(
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0);
+	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
 	if (error)
 		return error;
 
@@ -2777,38 +2790,41 @@ xfs_iunpin(
 }
 
 /*
- * This is called to wait for the given inode to be unpinned.
- * It will sleep until this happens.  The caller must have the
- * inode locked in at least shared mode so that the buffer cannot
- * be subsequently pinned once someone is waiting for it to be
- * unpinned.
+ * This is called to unpin an inode. It can be directed to wait or to return
+ * immediately without waiting for the inode to be unpinned.  The caller must
+ * have the inode locked in at least shared mode so that the buffer cannot be
+ * subsequently pinned once someone is waiting for it to be unpinned.
  */
 STATIC void
-xfs_iunpin_wait(
-	xfs_inode_t	*ip)
+__xfs_iunpin_wait(
+	xfs_inode_t	*ip,
+	int		wait)
 {
-	xfs_inode_log_item_t	*iip;
-	xfs_lsn_t	lsn;
+	xfs_inode_log_item_t	*iip = ip->i_itemp;
 
 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
-
-	if (atomic_read(&ip->i_pincount) == 0) {
+	if (atomic_read(&ip->i_pincount) == 0)
 		return;
-	}
 
-	iip = ip->i_itemp;
-	if (iip && iip->ili_last_lsn) {
-		lsn = iip->ili_last_lsn;
-	} else {
-		lsn = (xfs_lsn_t)0;
-	}
+	/* Give the log a push to start the unpinning I/O */
+	xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
+				iip->ili_last_lsn : 0, XFS_LOG_FORCE);
+	if (wait)
+		wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
+}
 
-	/*
-	 * Give the log a push so we don't wait here too long.
-	 */
-	xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
+static inline void
+xfs_iunpin_wait(
+	xfs_inode_t	*ip)
+{
+	__xfs_iunpin_wait(ip, 1);
+}
 
-	wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
+static inline void
+xfs_iunpin_nowait(
+	xfs_inode_t	*ip)
+{
+	__xfs_iunpin_wait(ip, 0);
 }
 
 
@@ -3003,6 +3019,7 @@ xfs_iflush(
 	int			bufwasdelwri;
 	struct hlist_node	*entry;
 	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
+	int			noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
 
 	XFS_STATS_INC(xs_iflush_count);
 
@@ -3027,11 +3044,21 @@ xfs_iflush(
 	}
 
 	/*
-	 * We can't flush the inode until it is unpinned, so
-	 * wait for it.  We know noone new can pin it, because
-	 * we are holding the inode lock shared and you need
-	 * to hold it exclusively to pin the inode.
+	 * We can't flush the inode until it is unpinned, so wait for it if we
+	 * are allowed to block.  We know noone new can pin it, because we are
+	 * holding the inode lock shared and you need to hold it exclusively to
+	 * pin the inode.
+	 *
+	 * If we are not allowed to block, force the log out asynchronously so
+	 * that when we come back the inode will be unpinned. If other inodes
+	 * in the same cluster are dirty, they will probably write the inode
+	 * out for us if they occur after the log force completes.
 	 */
+	if (noblock && xfs_ipincount(ip)) {
+		xfs_iunpin_nowait(ip);
+		xfs_ifunlock(ip);
+		return EAGAIN;
+	}
 	xfs_iunpin_wait(ip);
 
 	/*
@@ -3048,15 +3075,6 @@ xfs_iflush(
 	}
 
 	/*
-	 * Get the buffer containing the on-disk inode.
-	 */
-	error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0);
-	if (error) {
-		xfs_ifunlock(ip);
-		return error;
-	}
-
-	/*
 	 * Decide how buffer will be flushed out.  This is done before
 	 * the call to xfs_iflush_int because this field is zeroed by it.
 	 */
@@ -3072,6 +3090,7 @@ xfs_iflush(
 		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
 			flags = 0;
 			break;
+		case XFS_IFLUSH_ASYNC_NOBLOCK:
 		case XFS_IFLUSH_ASYNC:
 		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
 			flags = INT_ASYNC;
@@ -3091,6 +3110,7 @@ xfs_iflush(
 		case XFS_IFLUSH_DELWRI:
 			flags = INT_DELWRI;
 			break;
+		case XFS_IFLUSH_ASYNC_NOBLOCK:
 		case XFS_IFLUSH_ASYNC:
 			flags = INT_ASYNC;
 			break;
@@ -3105,6 +3125,16 @@ xfs_iflush(
 	}
 
 	/*
+	 * Get the buffer containing the on-disk inode.
+	 */
+	error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0,
+				noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
+	if (error || !bp) {
+		xfs_ifunlock(ip);
+		return error;
+	}
+
+	/*
 	 * First flush out the inode that xfs_iflush was called with.
 	 */
 	error = xfs_iflush_int(ip, bp);
@@ -3113,6 +3143,13 @@ xfs_iflush(
 	}
 
 	/*
+	 * If the buffer is pinned then push on the log now so we won't
+	 * get stuck waiting in the write for too long.
+	 */
+	if (XFS_BUF_ISPINNED(bp))
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+
+	/*
 	 * inode clustering:
 	 * see if other inodes can be gathered into this write
 	 */
@@ -3181,14 +3218,6 @@ xfs_iflush(
 		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
 	}
 
-	/*
-	 * If the buffer is pinned then push on the log so we won't
-	 * get stuck waiting in the write for too long.
-	 */
-	if (XFS_BUF_ISPINNED(bp)){
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-	}
-
 	if (flags & INT_DELWRI) {
 		xfs_bdwrite(mp, bp);
 	} else if (flags & INT_ASYNC) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index eaa0189..c3bfffc 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -457,6 +457,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
 #define	XFS_IFLUSH_SYNC			3
 #define	XFS_IFLUSH_ASYNC		4
 #define	XFS_IFLUSH_DELWRI		5
+#define	XFS_IFLUSH_ASYNC_NOBLOCK	6
 
 /*
  * Flags for xfs_itruncate_start().
@@ -511,7 +512,7 @@ int		xfs_finish_reclaim_all(struct xfs_mount *, int);
  */
 int		xfs_itobp(struct xfs_mount *, struct xfs_trans *,
 			  xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
-			  xfs_daddr_t, uint);
+			  xfs_daddr_t, uint, uint);
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
 			  xfs_inode_t **, xfs_daddr_t, uint);
 int		xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f615e04..45d8776 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -614,7 +614,8 @@ xfs_bulkstat(
 							xfs_buf_relse(bp);
 						error = xfs_itobp(mp, NULL, ip,
 								&dip, &bp, bno,
-								XFS_IMAP_BULKSTAT);
+								XFS_IMAP_BULKSTAT,
+								XFS_BUF_LOCK);
 						if (!error)
 							clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
 						kmem_zone_free(xfs_inode_zone, ip);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b2b70eb..cd24711 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3214,7 +3214,8 @@ xlog_recover_process_iunlinks(
 					 * next inode in the bucket.
 					 */
 					error = xfs_itobp(mp, NULL, ip, &dip,
-							&ibp, 0, 0);
+							&ibp, 0, 0,
+							XFS_BUF_LOCK);
 					ASSERT(error || (dip != NULL));
 				}
 
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 60b6b89..4e5c010 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -304,7 +304,8 @@ xfs_trans_read_buf(
 	if (tp == NULL) {
 		bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
 		if (!bp)
-			return XFS_ERROR(ENOMEM);
+			return (flags & XFS_BUF_TRYLOCK) ?
+					EAGAIN : XFS_ERROR(ENOMEM);
 
 		if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
 			xfs_ioerror_alert("xfs_trans_read_buf", mp,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 40b95e3..14140f6 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3468,29 +3468,6 @@ xfs_inode_flush(
 	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
 		return 0;
 
-	if (flags & FLUSH_LOG) {
-		if (iip && iip->ili_last_lsn) {
-			xlog_t		*log = mp->m_log;
-			xfs_lsn_t	sync_lsn;
-			int		log_flags = XFS_LOG_FORCE;
-
-			spin_lock(&log->l_grant_lock);
-			sync_lsn = log->l_last_sync_lsn;
-			spin_unlock(&log->l_grant_lock);
-
-			if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
-				if (flags & FLUSH_SYNC)
-					log_flags |= XFS_LOG_SYNC;
-				error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
-				if (error)
-					return error;
-			}
-
-			if (ip->i_update_core == 0)
-				return 0;
-		}
-	}
-
 	/*
 	 * We make this non-blocking if the inode is contended,
 	 * return EAGAIN to indicate to the caller that they
@@ -3498,30 +3475,22 @@ xfs_inode_flush(
 	 * blocking on inodes inside another operation right
 	 * now, they get caught later by xfs_sync.
 	 */
-	if (flags & FLUSH_INODE) {
-		int	flush_flags;
-
-		if (flags & FLUSH_SYNC) {
-			xfs_ilock(ip, XFS_ILOCK_SHARED);
-			xfs_iflock(ip);
-		} else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-			if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
-				xfs_iunlock(ip, XFS_ILOCK_SHARED);
-				return EAGAIN;
-			}
-		} else {
+	if (flags & FLUSH_SYNC) {
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		xfs_iflock(ip);
+	} else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
 			return EAGAIN;
 		}
-
-		if (flags & FLUSH_SYNC)
-			flush_flags = XFS_IFLUSH_SYNC;
-		else
-			flush_flags = XFS_IFLUSH_ASYNC;
-
-		error = xfs_iflush(ip, flush_flags);
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	} else {
+		return EAGAIN;
 	}
 
+	error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
+						    : XFS_IFLUSH_ASYNC_NOBLOCK);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
 	return error;
 }
 
-- 
cgit v0.10.2


From bad5584332e888ac40ca13584e8c114149ddb01e Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:43:49 +1100
Subject: [XFS] Remove the xfs_icluster structure

Remove the xfs_icluster structure and replace with a radix tree lookup.

We don't need to keep a list of inodes in each cluster around anymore as
we can look them up quickly when we need to. The only time we need to do
this now is during inode writeback.

Factor the inode cluster writeback code out of xfs_iflush and convert it
to use radix_tree_gang_lookup() instead of walking a list of inodes built
when we first read in the inodes.

This removes 3 pointers from each xfs_inode structure and the xfs_icluster
structure per inode cluster. Hence we reduce the cache footprint of the
xfs_inodes by between 5-10% depending on cluster sparseness.

To be truly efficient we need a radix_tree_gang_lookup_range() call to
stop searching once we are past the end of the cluster instead of trying
to find a full cluster's worth of inodes.

Before (ia64):

$ cat /sys/slab/xfs_inode/object_size 536

After:

$ cat /sys/slab/xfs_inode/object_size 512

SGI-PV: 977460
SGI-Modid: xfs-linux-melb:xfs-kern:30502a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 8e09b71..e657c51 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -78,7 +78,6 @@ xfs_iget_core(
 	xfs_inode_t	*ip;
 	xfs_inode_t	*iq;
 	int		error;
-	xfs_icluster_t	*icl, *new_icl = NULL;
 	unsigned long	first_index, mask;
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
@@ -229,11 +228,9 @@ finish_inode:
 	}
 
 	/*
-	 * This is a bit messy - we preallocate everything we _might_
-	 * need before we pick up the ici lock. That way we don't have to
-	 * juggle locks and go all the way back to the start.
+	 * Preload the radix tree so we can insert safely under the
+	 * write spinlock.
 	 */
-	new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP);
 	if (radix_tree_preload(GFP_KERNEL)) {
 		xfs_idestroy(ip);
 		delay(1);
@@ -242,17 +239,6 @@ finish_inode:
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
 	first_index = agino & mask;
 	write_lock(&pag->pag_ici_lock);
-
-	/*
-	 * Find the cluster if it exists
-	 */
-	icl = NULL;
-	if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
-							first_index, 1)) {
-		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index)
-			icl = iq->i_cluster;
-	}
-
 	/*
 	 * insert the new inode
 	 */
@@ -267,30 +253,13 @@ finish_inode:
 	}
 
 	/*
-	 * These values _must_ be set before releasing ihlock!
+	 * These values _must_ be set before releasing the radix tree lock!
 	 */
 	ip->i_udquot = ip->i_gdquot = NULL;
 	xfs_iflags_set(ip, XFS_INEW);
 
-	ASSERT(ip->i_cluster == NULL);
-
-	if (!icl) {
-		spin_lock_init(&new_icl->icl_lock);
-		INIT_HLIST_HEAD(&new_icl->icl_inodes);
-		icl = new_icl;
-		new_icl = NULL;
-	} else {
-		ASSERT(!hlist_empty(&icl->icl_inodes));
-	}
-	spin_lock(&icl->icl_lock);
-	hlist_add_head(&ip->i_cnode, &icl->icl_inodes);
-	ip->i_cluster = icl;
-	spin_unlock(&icl->icl_lock);
-
 	write_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
-	if (new_icl)
-		kmem_zone_free(xfs_icluster_zone, new_icl);
 
 	/*
 	 * Link ip to its mount and thread it on the mount's inode list.
@@ -529,18 +498,6 @@ xfs_iextract(
 	xfs_put_perag(mp, pag);
 
 	/*
-	 * Remove from cluster list
-	 */
-	mp = ip->i_mount;
-	spin_lock(&ip->i_cluster->icl_lock);
-	hlist_del(&ip->i_cnode);
-	spin_unlock(&ip->i_cluster->icl_lock);
-
-	/* was last inode in cluster? */
-	if (hlist_empty(&ip->i_cluster->icl_inodes))
-		kmem_zone_free(xfs_icluster_zone, ip->i_cluster);
-
-	/*
 	 * Remove from mount's inode list.
 	 */
 	XFS_MOUNT_ILOCK(mp);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3c3e9e3..040c0e4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -55,7 +55,6 @@
 
 kmem_zone_t *xfs_ifork_zone;
 kmem_zone_t *xfs_inode_zone;
-kmem_zone_t *xfs_icluster_zone;
 
 /*
  * Used in xfs_itruncate().  This is the maximum number of extents
@@ -2994,6 +2993,153 @@ xfs_iflush_fork(
 	return 0;
 }
 
+STATIC int
+xfs_iflush_cluster(
+	xfs_inode_t	*ip,
+	xfs_buf_t	*bp)
+{
+	xfs_mount_t		*mp = ip->i_mount;
+	xfs_perag_t		*pag = xfs_get_perag(mp, ip->i_ino);
+	unsigned long		first_index, mask;
+	int			ilist_size;
+	xfs_inode_t		**ilist;
+	xfs_inode_t		*iq;
+	xfs_inode_log_item_t	*iip;
+	int			nr_found;
+	int			clcount = 0;
+	int			bufwasdelwri;
+	int			i;
+
+	ASSERT(pag->pagi_inodeok);
+	ASSERT(pag->pag_ici_init);
+
+	ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *);
+	ilist = kmem_alloc(ilist_size, KM_MAYFAIL);
+	if (!ilist)
+		return 0;
+
+	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
+	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
+	read_lock(&pag->pag_ici_lock);
+	/* really need a gang lookup range call here */
+	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+					first_index,
+					XFS_INODE_CLUSTER_SIZE(mp));
+	if (nr_found == 0)
+		goto out_free;
+
+	for (i = 0; i < nr_found; i++) {
+		iq = ilist[i];
+		if (iq == ip)
+			continue;
+		/* if the inode lies outside this cluster, we're done. */
+		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
+			break;
+		/*
+		 * Do an un-protected check to see if the inode is dirty and
+		 * is a candidate for flushing.  These checks will be repeated
+		 * later after the appropriate locks are acquired.
+		 */
+		iip = iq->i_itemp;
+		if ((iq->i_update_core == 0) &&
+		    ((iip == NULL) ||
+		     !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
+		      xfs_ipincount(iq) == 0) {
+			continue;
+		}
+
+		/*
+		 * Try to get locks.  If any are unavailable or it is pinned,
+		 * then this inode cannot be flushed and is skipped.
+		 */
+
+		if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+			continue;
+		if (!xfs_iflock_nowait(iq)) {
+			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+			continue;
+		}
+		if (xfs_ipincount(iq)) {
+			xfs_ifunlock(iq);
+			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+			continue;
+		}
+
+		/*
+		 * arriving here means that this inode can be flushed.  First
+		 * re-check that it's dirty before flushing.
+		 */
+		iip = iq->i_itemp;
+		if ((iq->i_update_core != 0) || ((iip != NULL) &&
+		     (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+			int error;
+			error = xfs_iflush_int(iq, bp);
+			if (error) {
+				xfs_iunlock(iq, XFS_ILOCK_SHARED);
+				goto cluster_corrupt_out;
+			}
+			clcount++;
+		} else {
+			xfs_ifunlock(iq);
+		}
+		xfs_iunlock(iq, XFS_ILOCK_SHARED);
+	}
+
+	if (clcount) {
+		XFS_STATS_INC(xs_icluster_flushcnt);
+		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+	}
+
+out_free:
+	read_unlock(&pag->pag_ici_lock);
+	kmem_free(ilist, ilist_size);
+	return 0;
+
+
+cluster_corrupt_out:
+	/*
+	 * Corruption detected in the clustering loop.  Invalidate the
+	 * inode buffer and shut down the filesystem.
+	 */
+	read_unlock(&pag->pag_ici_lock);
+	/*
+	 * Clean up the buffer.  If it was B_DELWRI, just release it --
+	 * brelse can handle it with no problems.  If not, shut down the
+	 * filesystem before releasing the buffer.
+	 */
+	bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
+	if (bufwasdelwri)
+		xfs_buf_relse(bp);
+
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+
+	if (!bufwasdelwri) {
+		/*
+		 * Just like incore_relse: if we have b_iodone functions,
+		 * mark the buffer as an error and call them.  Otherwise
+		 * mark it as stale and brelse.
+		 */
+		if (XFS_BUF_IODONE_FUNC(bp)) {
+			XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+			XFS_BUF_UNDONE(bp);
+			XFS_BUF_STALE(bp);
+			XFS_BUF_SHUT(bp);
+			XFS_BUF_ERROR(bp,EIO);
+			xfs_biodone(bp);
+		} else {
+			XFS_BUF_STALE(bp);
+			xfs_buf_relse(bp);
+		}
+	}
+
+	/*
+	 * Unlocks the flush lock
+	 */
+	xfs_iflush_abort(iq);
+	kmem_free(ilist, ilist_size);
+	return XFS_ERROR(EFSCORRUPTED);
+}
+
 /*
  * xfs_iflush() will write a modified inode's changes out to the
  * inode's on disk home.  The caller must have the inode lock held
@@ -3013,13 +3159,8 @@ xfs_iflush(
 	xfs_dinode_t		*dip;
 	xfs_mount_t		*mp;
 	int			error;
-	/* REFERENCED */
-	xfs_inode_t		*iq;
-	int			clcount;	/* count of inodes clustered */
-	int			bufwasdelwri;
-	struct hlist_node	*entry;
-	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
 	int			noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
+	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
 
 	XFS_STATS_INC(xs_iflush_count);
 
@@ -3138,9 +3279,8 @@ xfs_iflush(
 	 * First flush out the inode that xfs_iflush was called with.
 	 */
 	error = xfs_iflush_int(ip, bp);
-	if (error) {
+	if (error)
 		goto corrupt_out;
-	}
 
 	/*
 	 * If the buffer is pinned then push on the log now so we won't
@@ -3153,70 +3293,9 @@ xfs_iflush(
 	 * inode clustering:
 	 * see if other inodes can be gathered into this write
 	 */
-	spin_lock(&ip->i_cluster->icl_lock);
-	ip->i_cluster->icl_buf = bp;
-
-	clcount = 0;
-	hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) {
-		if (iq == ip)
-			continue;
-
-		/*
-		 * Do an un-protected check to see if the inode is dirty and
-		 * is a candidate for flushing.  These checks will be repeated
-		 * later after the appropriate locks are acquired.
-		 */
-		iip = iq->i_itemp;
-		if ((iq->i_update_core == 0) &&
-		    ((iip == NULL) ||
-		     !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
-		      xfs_ipincount(iq) == 0) {
-			continue;
-		}
-
-		/*
-		 * Try to get locks.  If any are unavailable,
-		 * then this inode cannot be flushed and is skipped.
-		 */
-
-		/* get inode locks (just i_lock) */
-		if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
-			/* get inode flush lock */
-			if (xfs_iflock_nowait(iq)) {
-				/* check if pinned */
-				if (xfs_ipincount(iq) == 0) {
-					/* arriving here means that
-					 * this inode can be flushed.
-					 * first re-check that it's
-					 * dirty
-					 */
-					iip = iq->i_itemp;
-					if ((iq->i_update_core != 0)||
-					    ((iip != NULL) &&
-					     (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
-						clcount++;
-						error = xfs_iflush_int(iq, bp);
-						if (error) {
-							xfs_iunlock(iq,
-								    XFS_ILOCK_SHARED);
-							goto cluster_corrupt_out;
-						}
-					} else {
-						xfs_ifunlock(iq);
-					}
-				} else {
-					xfs_ifunlock(iq);
-				}
-			}
-			xfs_iunlock(iq, XFS_ILOCK_SHARED);
-		}
-	}
-	spin_unlock(&ip->i_cluster->icl_lock);
-
-	if (clcount) {
-		XFS_STATS_INC(xs_icluster_flushcnt);
-		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
-	}
+	error = xfs_iflush_cluster(ip, bp);
+	if (error)
+		goto cluster_corrupt_out;
 
 	if (flags & INT_DELWRI) {
 		xfs_bdwrite(mp, bp);
@@ -3230,52 +3309,11 @@ xfs_iflush(
 corrupt_out:
 	xfs_buf_relse(bp);
 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-	xfs_iflush_abort(ip);
-	/*
-	 * Unlocks the flush lock
-	 */
-	return XFS_ERROR(EFSCORRUPTED);
-
 cluster_corrupt_out:
-	/* Corruption detected in the clustering loop.  Invalidate the
-	 * inode buffer and shut down the filesystem.
-	 */
-	spin_unlock(&ip->i_cluster->icl_lock);
-
-	/*
-	 * Clean up the buffer.  If it was B_DELWRI, just release it --
-	 * brelse can handle it with no problems.  If not, shut down the
-	 * filesystem before releasing the buffer.
-	 */
-	if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) {
-		xfs_buf_relse(bp);
-	}
-
-	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-
-	if(!bufwasdelwri)  {
-		/*
-		 * Just like incore_relse: if we have b_iodone functions,
-		 * mark the buffer as an error and call them.  Otherwise
-		 * mark it as stale and brelse.
-		 */
-		if (XFS_BUF_IODONE_FUNC(bp)) {
-			XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-			XFS_BUF_UNDONE(bp);
-			XFS_BUF_STALE(bp);
-			XFS_BUF_SHUT(bp);
-			XFS_BUF_ERROR(bp,EIO);
-			xfs_biodone(bp);
-		} else {
-			XFS_BUF_STALE(bp);
-			xfs_buf_relse(bp);
-		}
-	}
-
-	xfs_iflush_abort(iq);
 	/*
 	 * Unlocks the flush lock
 	 */
+	xfs_iflush_abort(ip);
 	return XFS_ERROR(EFSCORRUPTED);
 }
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c3bfffc..93c3769 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -133,19 +133,6 @@ typedef struct dm_attrs_s {
 } dm_attrs_t;
 
 /*
- * This is the xfs inode cluster structure.  This structure is used by
- * xfs_iflush to find inodes that share a cluster and can be flushed to disk at
- * the same time.
- */
-typedef struct xfs_icluster {
-	struct hlist_head	icl_inodes;	/* list of inodes on cluster */
-	xfs_daddr_t		icl_blkno;	/* starting block number of
-						 * the cluster */
-	struct xfs_buf		*icl_buf;	/* the inode buffer */
-	spinlock_t		icl_lock;	/* inode list lock */
-} xfs_icluster_t;
-
-/*
  * This is the xfs in-core inode structure.
  * Most of the on-disk inode is embedded in the i_d field.
  *
@@ -248,8 +235,6 @@ typedef struct xfs_inode {
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
 
 	xfs_icdinode_t		i_d;		/* most of ondisk inode */
-	xfs_icluster_t		*i_cluster;	/* cluster list header */
-	struct hlist_node	i_cnode;	/* cluster link node */
 
 	xfs_fsize_t		i_size;		/* in-memory size */
 	xfs_fsize_t		i_new_size;	/* size when write completes */
@@ -594,7 +579,6 @@ void		xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
 #define	xfs_inobp_check(mp, bp)
 #endif /* DEBUG */
 
-extern struct kmem_zone	*xfs_icluster_zone;
 extern struct kmem_zone	*xfs_ifork_zone;
 extern struct kmem_zone	*xfs_inode_zone;
 extern struct kmem_zone	*xfs_ili_zone;
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 79bdfb3..3ec27bf 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -112,9 +112,6 @@ xfs_init(void)
 	xfs_ili_zone =
 		kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
 					KM_ZONE_SPREAD, NULL);
-	xfs_icluster_zone =
-		kmem_zone_init_flags(sizeof(xfs_icluster_t), "xfs_icluster",
-					KM_ZONE_SPREAD, NULL);
 
 	/*
 	 * Allocate global trace buffers.
@@ -152,7 +149,6 @@ xfs_cleanup(void)
 	extern kmem_zone_t	*xfs_inode_zone;
 	extern kmem_zone_t	*xfs_efd_zone;
 	extern kmem_zone_t	*xfs_efi_zone;
-	extern kmem_zone_t	*xfs_icluster_zone;
 
 	xfs_cleanup_procfs();
 	xfs_sysctl_unregister();
@@ -187,7 +183,6 @@ xfs_cleanup(void)
 	kmem_zone_destroy(xfs_efi_zone);
 	kmem_zone_destroy(xfs_ifork_zone);
 	kmem_zone_destroy(xfs_ili_zone);
-	kmem_zone_destroy(xfs_icluster_zone);
 }
 
 /*
-- 
cgit v0.10.2


From 3354040897f828644be6ca5783588e9f64a53b8e Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:43:59 +1100
Subject: [XFS] Use xfs_inode_clean() in more places

Remove open coded checks for whether the inode is clean and replace
them with an inlined function.

SGI-PV: 977461
SGI-Modid: xfs-linux-melb:xfs-kern:30503a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 040c0e4..d7514f8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2118,13 +2118,6 @@ xfs_iunlink_remove(
 	return 0;
 }
 
-STATIC_INLINE int xfs_inode_clean(xfs_inode_t *ip)
-{
-	return (((ip->i_itemp == NULL) ||
-		!(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
-		(ip->i_update_core == 0));
-}
-
 STATIC void
 xfs_ifree_cluster(
 	xfs_inode_t	*free_ip,
@@ -3004,7 +2997,6 @@ xfs_iflush_cluster(
 	int			ilist_size;
 	xfs_inode_t		**ilist;
 	xfs_inode_t		*iq;
-	xfs_inode_log_item_t	*iip;
 	int			nr_found;
 	int			clcount = 0;
 	int			bufwasdelwri;
@@ -3040,13 +3032,8 @@ xfs_iflush_cluster(
 		 * is a candidate for flushing.  These checks will be repeated
 		 * later after the appropriate locks are acquired.
 		 */
-		iip = iq->i_itemp;
-		if ((iq->i_update_core == 0) &&
-		    ((iip == NULL) ||
-		     !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
-		      xfs_ipincount(iq) == 0) {
+		if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
 			continue;
-		}
 
 		/*
 		 * Try to get locks.  If any are unavailable or it is pinned,
@@ -3069,10 +3056,8 @@ xfs_iflush_cluster(
 		 * arriving here means that this inode can be flushed.  First
 		 * re-check that it's dirty before flushing.
 		 */
-		iip = iq->i_itemp;
-		if ((iq->i_update_core != 0) || ((iip != NULL) &&
-		     (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
-			int error;
+		if (!xfs_inode_clean(iq)) {
+			int	error;
 			error = xfs_iflush_int(iq, bp);
 			if (error) {
 				xfs_iunlock(iq, XFS_ILOCK_SHARED);
@@ -3176,8 +3161,7 @@ xfs_iflush(
 	 * If the inode isn't dirty, then just release the inode
 	 * flush lock and do nothing.
 	 */
-	if ((ip->i_update_core == 0) &&
-	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+	if (xfs_inode_clean(ip)) {
 		ASSERT((iip != NULL) ?
 			 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
 		xfs_ifunlock(ip);
@@ -3343,8 +3327,7 @@ xfs_iflush_int(
 	 * If the inode isn't dirty, then just release the inode
 	 * flush lock and do nothing.
 	 */
-	if ((ip->i_update_core == 0) &&
-	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+	if (xfs_inode_clean(ip)) {
 		xfs_ifunlock(ip);
 		return 0;
 	}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index bfe92ea..4051307 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -168,6 +168,14 @@ static inline int xfs_ilog_fext(int w)
 	return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
 }
 
+static inline int xfs_inode_clean(xfs_inode_t *ip)
+{
+	return (!ip->i_itemp ||
+		!(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
+	       !ip->i_update_core;
+}
+
+
 #ifdef __KERNEL__
 
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 14140f6..5390d12 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3454,7 +3454,6 @@ xfs_inode_flush(
 	int		flags)
 {
 	xfs_mount_t	*mp = ip->i_mount;
-	xfs_inode_log_item_t *iip = ip->i_itemp;
 	int		error = 0;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
@@ -3464,8 +3463,7 @@ xfs_inode_flush(
 	 * Bypass inodes which have already been cleaned by
 	 * the inode flush clustering code inside xfs_iflush
 	 */
-	if ((ip->i_update_core == 0) &&
-	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
+	if (xfs_inode_clean(ip))
 		return 0;
 
 	/*
-- 
cgit v0.10.2


From b589334c7a1fff85d2f009d5db4c34fad48925e9 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:44:06 +1100
Subject: [XFS] Prevent AIL lock contention during transaction completion

When hundreds of processors attempt to commit transactions at the same
time, they can contend on the AIL lock when updating the tail LSN held in
the in-core log structure.

At the moment, the tail LSN is only needed when actually writing out an
iclog, so it really does not need to be updated on every single
transaction completion - only those that result in switching iclogs and
flushing them to disk.

The result is that we reduce the number of times we need to grab the AIL
lock and the log grant lock by up to two orders of magnitude on large
processor count machines. The problem has previously been hidden by AIL
lock contention walking the AIL list which was recently solved and
uncovered this issue.

SGI-PV: 975671
SGI-Modid: xfs-linux-melb:xfs-kern:30504a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 31f2b04..2e35077 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2813,15 +2813,13 @@ xlog_state_put_ticket(xlog_t	    *log,
  *
  */
 STATIC int
-xlog_state_release_iclog(xlog_t		*log,
-			 xlog_in_core_t	*iclog)
+xlog_state_release_iclog(
+	xlog_t		*log,
+	xlog_in_core_t	*iclog)
 {
 	int		sync = 0;	/* do we sync? */
 
-	xlog_assign_tail_lsn(log->l_mp);
-
 	spin_lock(&log->l_icloglock);
-
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		spin_unlock(&log->l_icloglock);
 		return XFS_ERROR(EIO);
@@ -2833,13 +2831,14 @@ xlog_state_release_iclog(xlog_t		*log,
 
 	if (--iclog->ic_refcnt == 0 &&
 	    iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+		/* update tail before writing to iclog */
+		xlog_assign_tail_lsn(log->l_mp);
 		sync++;
 		iclog->ic_state = XLOG_STATE_SYNCING;
 		iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn);
 		xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
 		/* cycle incremented when incrementing curr_block */
 	}
-
 	spin_unlock(&log->l_icloglock);
 
 	/*
@@ -2849,11 +2848,9 @@ xlog_state_release_iclog(xlog_t		*log,
 	 * this iclog has consistent data, so we ignore IOERROR
 	 * flags after this point.
 	 */
-	if (sync) {
+	if (sync)
 		return xlog_sync(log, iclog);
-	}
 	return 0;
-
 }	/* xlog_state_release_iclog */
 
 
-- 
cgit v0.10.2


From 155cc6b784a959ed456fe46dca522e1d28b3b718 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:44:14 +1100
Subject: [XFS] Use atomics for iclog reference counting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that we update the log tail LSN less frequently on transaction
completion, we pass the contention straight to the global log state lock
(l_icloglock) during transaction completion.

We currently have to take this lock to decrement the iclog reference
count. There is a reference count on each iclog, so we need to take the
global lock for all refcount changes.

When large numbers of processes are all doing small transactions, the iclog
reference counts will be quite high, and the state change that absolutely
requires the l_icloglock is the exception rather than the norm.

Change the reference counting on the iclogs to use atomic_inc/dec so that
we can use atomic_dec_and_lock during transaction completion and avoid the
need for grabbing the l_icloglock for every reference count decrement
except the one that matters - the last.

SGI-PV: 975671
SGI-Modid: xfs-linux-melb:xfs-kern:30505a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2e35077..1fa9809 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -675,7 +675,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 
 		spin_lock(&log->l_icloglock);
 		iclog = log->l_iclog;
-		iclog->ic_refcnt++;
+		atomic_inc(&iclog->ic_refcnt);
 		spin_unlock(&log->l_icloglock);
 		xlog_state_want_sync(log, iclog);
 		(void) xlog_state_release_iclog(log, iclog);
@@ -713,7 +713,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		 */
 		spin_lock(&log->l_icloglock);
 		iclog = log->l_iclog;
-		iclog->ic_refcnt++;
+		atomic_inc(&iclog->ic_refcnt);
 		spin_unlock(&log->l_icloglock);
 
 		xlog_state_want_sync(log, iclog);
@@ -1405,7 +1405,7 @@ xlog_sync(xlog_t		*log,
 	int		v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
 
 	XFS_STATS_INC(xs_log_writes);
-	ASSERT(iclog->ic_refcnt == 0);
+	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
 
 	/* Add for LR header */
 	count_init = log->l_iclog_hsize + iclog->ic_offset;
@@ -2309,7 +2309,7 @@ xlog_state_done_syncing(
 
 	ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
 	       iclog->ic_state == XLOG_STATE_IOERROR);
-	ASSERT(iclog->ic_refcnt == 0);
+	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
 	ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
 
 
@@ -2391,7 +2391,7 @@ restart:
 	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
 	head = &iclog->ic_header;
 
-	iclog->ic_refcnt++;			/* prevents sync */
+	atomic_inc(&iclog->ic_refcnt);	/* prevents sync */
 	log_offset = iclog->ic_offset;
 
 	/* On the 1st write to an iclog, figure out lsn.  This works
@@ -2423,12 +2423,12 @@ restart:
 		xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
 
 		/* If I'm the only one writing to this iclog, sync it to disk */
-		if (iclog->ic_refcnt == 1) {
+		if (atomic_read(&iclog->ic_refcnt) == 1) {
 			spin_unlock(&log->l_icloglock);
 			if ((error = xlog_state_release_iclog(log, iclog)))
 				return error;
 		} else {
-			iclog->ic_refcnt--;
+			atomic_dec(&iclog->ic_refcnt);
 			spin_unlock(&log->l_icloglock);
 		}
 		goto restart;
@@ -2819,18 +2819,21 @@ xlog_state_release_iclog(
 {
 	int		sync = 0;	/* do we sync? */
 
-	spin_lock(&log->l_icloglock);
+	if (iclog->ic_state & XLOG_STATE_IOERROR)
+		return XFS_ERROR(EIO);
+
+	ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
+	if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
+		return 0;
+
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		spin_unlock(&log->l_icloglock);
 		return XFS_ERROR(EIO);
 	}
-
-	ASSERT(iclog->ic_refcnt > 0);
 	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
 	       iclog->ic_state == XLOG_STATE_WANT_SYNC);
 
-	if (--iclog->ic_refcnt == 0 &&
-	    iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+	if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
 		/* update tail before writing to iclog */
 		xlog_assign_tail_lsn(log->l_mp);
 		sync++;
@@ -2950,7 +2953,8 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
 		 * previous iclog and go to sleep.
 		 */
 		if (iclog->ic_state == XLOG_STATE_DIRTY ||
-		    (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) {
+		    (atomic_read(&iclog->ic_refcnt) == 0
+		     && iclog->ic_offset == 0)) {
 			iclog = iclog->ic_prev;
 			if (iclog->ic_state == XLOG_STATE_ACTIVE ||
 			    iclog->ic_state == XLOG_STATE_DIRTY)
@@ -2958,14 +2962,14 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
 			else
 				goto maybe_sleep;
 		} else {
-			if (iclog->ic_refcnt == 0) {
+			if (atomic_read(&iclog->ic_refcnt) == 0) {
 				/* We are the only one with access to this
 				 * iclog.  Flush it out now.  There should
 				 * be a roundoff of zero to show that someone
 				 * has already taken care of the roundoff from
 				 * the previous sync.
 				 */
-				iclog->ic_refcnt++;
+				atomic_inc(&iclog->ic_refcnt);
 				lsn = be64_to_cpu(iclog->ic_header.h_lsn);
 				xlog_state_switch_iclogs(log, iclog, 0);
 				spin_unlock(&log->l_icloglock);
@@ -3097,7 +3101,7 @@ try_again:
 			already_slept = 1;
 			goto try_again;
 		} else {
-			iclog->ic_refcnt++;
+			atomic_inc(&iclog->ic_refcnt);
 			xlog_state_switch_iclogs(log, iclog, 0);
 			spin_unlock(&log->l_icloglock);
 			if (xlog_state_release_iclog(log, iclog))
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index c6244cc..01c63db 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -339,7 +339,7 @@ typedef struct xlog_iclog_fields {
 #endif
 	int			ic_size;
 	int			ic_offset;
-	int			ic_refcnt;
+	atomic_t		ic_refcnt;
 	int			ic_bwritecnt;
 	ushort_t		ic_state;
 	char			*ic_datap;	/* pointer to iclog data */
-- 
cgit v0.10.2


From db0bb7baa1533db156d8af3ebeda1f0473a0197a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:44:35 +1100
Subject: [XFS] cleanup xfs_vn_mknod

- use proper goto based unwinding instead of the current mess of
  multiple conditionals
- rename ip to inode because that's the normal convention for Linux
  inodes while ip is the convention for xfs_inodes
- remove unlikely checks for the default_acl - branches marked unlikely
  might lead to extreme branch predictor slowdowns if taken and for some
  workloads a default acl is quite common
- properly indent the switch statements
- remove xfs_has_fs_struct as nfsd has a fs_struct in any semi-recent
  kernel

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30529a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index cc4abd3..3467011 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -241,18 +241,6 @@ xfs_init_security(
 	return error;
 }
 
-/*
- * Determine whether a process has a valid fs_struct (kernel daemons
- * like knfsd don't have an fs_struct).
- *
- * XXX(hch):  nfsd is broken, better fix it instead.
- */
-STATIC_INLINE int
-xfs_has_fs_struct(struct task_struct *task)
-{
-	return (task->fs != init_task.fs);
-}
-
 STATIC void
 xfs_cleanup_inode(
 	struct inode	*dir,
@@ -284,7 +272,7 @@ xfs_vn_mknod(
 	int		mode,
 	dev_t		rdev)
 {
-	struct inode	*ip;
+	struct inode	*inode;
 	bhv_vnode_t	*vp = NULL, *dvp = vn_from_inode(dir);
 	xfs_acl_t	*default_acl = NULL;
 	attrexists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
@@ -297,7 +285,7 @@ xfs_vn_mknod(
 	if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
 		return -EINVAL;
 
-	if (unlikely(test_default_acl && test_default_acl(dvp))) {
+	if (test_default_acl && test_default_acl(dvp)) {
 		if (!_ACL_ALLOC(default_acl)) {
 			return -ENOMEM;
 		}
@@ -307,11 +295,14 @@ xfs_vn_mknod(
 		}
 	}
 
-	if (IS_POSIXACL(dir) && !default_acl && xfs_has_fs_struct(current))
+	if (IS_POSIXACL(dir) && !default_acl)
 		mode &= ~current->fs->umask;
 
 	switch (mode & S_IFMT) {
-	case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFIFO:
+	case S_IFSOCK:
 		rdev = sysv_encode_dev(rdev);
 	case S_IFREG:
 		error = xfs_create(XFS_I(dir), dentry, mode, rdev, &vp, NULL);
@@ -324,32 +315,34 @@ xfs_vn_mknod(
 		break;
 	}
 
-	if (unlikely(!error)) {
-		error = xfs_init_security(vp, dir);
-		if (error)
-			xfs_cleanup_inode(dir, vp, dentry, mode);
-	}
+	if (unlikely(error))
+		goto out_free_acl;
 
-	if (unlikely(default_acl)) {
-		if (!error) {
-			error = _ACL_INHERIT(vp, mode, default_acl);
-			if (!error)
-				xfs_iflags_set(XFS_I(vp), XFS_IMODIFIED);
-			else
-				xfs_cleanup_inode(dir, vp, dentry, mode);
-		}
+	error = xfs_init_security(vp, dir);
+	if (unlikely(error))
+		goto out_cleanup_inode;
+
+	if (default_acl) {
+		error = _ACL_INHERIT(vp, mode, default_acl);
+		if (unlikely(error))
+			goto out_cleanup_inode;
+		xfs_iflags_set(XFS_I(vp), XFS_IMODIFIED);
 		_ACL_FREE(default_acl);
 	}
 
-	if (likely(!error)) {
-		ASSERT(vp);
-		ip = vn_to_inode(vp);
+	inode = vn_to_inode(vp);
 
-		if (S_ISDIR(mode))
-			xfs_validate_fields(ip);
-		d_instantiate(dentry, ip);
-		xfs_validate_fields(dir);
-	}
+	if (S_ISDIR(mode))
+		xfs_validate_fields(inode);
+	d_instantiate(dentry, inode);
+	xfs_validate_fields(dir);
+	return -error;
+
+ out_cleanup_inode:
+	xfs_cleanup_inode(dir, vp, dentry, mode);
+ out_free_acl:
+	if (default_acl)
+		_ACL_FREE(default_acl);
 	return -error;
 }
 
-- 
cgit v0.10.2


From a8b3acd57e3aaaf73a863a28e0e9f6cca37cd8e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:44:41 +1100
Subject: [XFS] vnode cleanup in xfs_fs_subr.c

Cleanup the unneeded intermediate vnode step in the flushing helpers and
go directly from the xfs_inode to the struct address_space.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30530a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index ac6d34c..1eefe61 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -17,18 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_vnodeops.h"
-
-/*
- * The following six includes are needed so that we can include
- * xfs_inode.h.  What a mess..
- */
 #include "xfs_bmap_btree.h"
-#include "xfs_inum.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-
 #include "xfs_inode.h"
 
 int  fs_noerr(void) { return 0; }
@@ -42,11 +31,10 @@ xfs_tosspages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	bhv_vnode_t	*vp = XFS_ITOV(ip);
-	struct inode	*inode = vn_to_inode(vp);
+	struct address_space *mapping = ip->i_vnode->i_mapping;
 
-	if (VN_CACHED(vp))
-		truncate_inode_pages(inode->i_mapping, first);
+	if (mapping->nrpages)
+		truncate_inode_pages(mapping, first);
 }
 
 int
@@ -56,15 +44,14 @@ xfs_flushinval_pages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	bhv_vnode_t	*vp = XFS_ITOV(ip);
-	struct inode	*inode = vn_to_inode(vp);
+	struct address_space *mapping = ip->i_vnode->i_mapping;
 	int		ret = 0;
 
-	if (VN_CACHED(vp)) {
+	if (mapping->nrpages) {
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = filemap_write_and_wait(inode->i_mapping);
+		ret = filemap_write_and_wait(mapping);
 		if (!ret)
-			truncate_inode_pages(inode->i_mapping, first);
+			truncate_inode_pages(mapping, first);
 	}
 	return ret;
 }
@@ -77,17 +64,16 @@ xfs_flush_pages(
 	uint64_t	flags,
 	int		fiopt)
 {
-	bhv_vnode_t	*vp = XFS_ITOV(ip);
-	struct inode	*inode = vn_to_inode(vp);
+	struct address_space *mapping = ip->i_vnode->i_mapping;
 	int		ret = 0;
 	int		ret2;
 
-	if (VN_DIRTY(vp)) {
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = filemap_fdatawrite(inode->i_mapping);
+		ret = filemap_fdatawrite(mapping);
 		if (flags & XFS_B_ASYNC)
 			return ret;
-		ret2 = filemap_fdatawait(inode->i_mapping);
+		ret2 = filemap_fdatawait(mapping);
 		if (!ret)
 			ret = ret2;
 	}
-- 
cgit v0.10.2


From 43973964a386348af0a392266f008ba24170aa30 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:44:50 +1100
Subject: [XFS] kill xfs_get_dir_entry

Instead of xfs_get_dir_entry use a macro to get the xfs_inode from the
dentry in the callers and grab the reference manually.

Only grab the reference once as it's fine to keep it over the dmapi calls.
(And even that reference is actually superfluous in Linux but I'll leave
that for another patch)

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30531a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index f200e024..2022318 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -227,7 +227,7 @@ static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp)
  */
 #define VNAME(dentry)		((char *) (dentry)->d_name.name)
 #define VNAMELEN(dentry)	((dentry)->d_name.len)
-#define VNAME_TO_VNODE(dentry)	(vn_from_inode((dentry)->d_inode))
+#define VNAME_TO_INODE(dentry)	(XFS_I((dentry)->d_inode))
 
 /*
  * Dealing with bad inodes
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 1c6d40e..fd1244c 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -93,7 +93,8 @@ xfs_lock_for_rename(
 	xfs_inode_t	**i_tab,/* array of inode returned, sorted */
 	int		*num_inodes)  /* number of inodes in array */
 {
-	xfs_inode_t		*ip1, *ip2, *temp;
+	xfs_inode_t		*ip1 = VNAME_TO_INODE(vname1);
+	xfs_inode_t		*ip2, *temp;
 	xfs_ino_t		inum1, inum2;
 	int			error;
 	int			i, j;
@@ -109,16 +110,11 @@ xfs_lock_for_rename(
 	 * to see if we still have the right inodes, directories, etc.
 	 */
 	lock_mode = xfs_ilock_map_shared(dp1);
-	error = xfs_get_dir_entry(vname1, &ip1);
-	if (error) {
-		xfs_iunlock_map_shared(dp1, lock_mode);
-		return error;
-	}
+	IHOLD(ip1);
+	xfs_itrace_ref(ip1);
 
 	inum1 = ip1->i_ino;
 
-	ASSERT(ip1);
-	xfs_itrace_ref(ip1);
 
 	/*
 	 * Unlock dp1 and lock dp2 if they are different.
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 18a85e7..47c45ff 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -40,28 +40,6 @@
 #include "xfs_itable.h"
 #include "xfs_utils.h"
 
-/*
- * xfs_get_dir_entry is used to get a reference to an inode given
- * its parent directory inode and the name of the file.	 It does
- * not lock the child inode, and it unlocks the directory before
- * returning.  The directory's generation number is returned for
- * use by a later call to xfs_lock_dir_and_entry.
- */
-int
-xfs_get_dir_entry(
-	bhv_vname_t	*dentry,
-	xfs_inode_t	**ipp)
-{
-	bhv_vnode_t	*vp;
-
-	vp = VNAME_TO_VNODE(dentry);
-
-	*ipp = xfs_vtoi(vp);
-	if (!*ipp)
-		return XFS_ERROR(ENOENT);
-	VN_HOLD(vp);
-	return 0;
-}
 
 int
 xfs_dir_lookup_int(
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f857fcc..c4c4a6a 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -21,7 +21,6 @@
 #define IRELE(ip)	VN_RELE(XFS_ITOV(ip))
 #define IHOLD(ip)	VN_HOLD(XFS_ITOV(ip))
 
-extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **);
 extern int xfs_dir_lookup_int (xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *,
 				xfs_inode_t **);
 extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 5390d12..4765e7c 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2270,41 +2270,30 @@ xfs_remove(
 	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
 	char			*name = VNAME(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
-	xfs_inode_t             *ip;
+	xfs_inode_t             *ip = VNAME_TO_INODE(dentry);
+	int			namelen = VNAMELEN(dentry);
 	xfs_trans_t             *tp = NULL;
 	int                     error = 0;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
 	int			cancel_flags;
 	int			committed;
-	int			dm_di_mode = 0;
 	int			link_zero;
 	uint			resblks;
-	int			namelen;
 
 	xfs_itrace_entry(dp);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	namelen = VNAMELEN(dentry);
-
-	if (!xfs_get_dir_entry(dentry, &ip)) {
-	        dm_di_mode = ip->i_d.di_mode;
-		IRELE(ip);
-	}
-
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					name, NULL, dm_di_mode, 0, 0);
+					name, NULL, ip->i_d.di_mode, 0, 0);
 		if (error)
 			return error;
 	}
 
-	/* From this point on, return through std_return */
-	ip = NULL;
-
 	/*
 	 * We need to get a reference to ip before we get our log
 	 * reservation. The reason for this is that we cannot call
@@ -2317,13 +2306,7 @@ xfs_remove(
 	 * when we call xfs_iget.  Instead we get an unlocked reference
 	 * to the inode before getting our log reservation.
 	 */
-	error = xfs_get_dir_entry(dentry, &ip);
-	if (error) {
-		REMOVE_DEBUG_TRACE(__LINE__);
-		goto std_return;
-	}
-
-	dm_di_mode = ip->i_d.di_mode;
+	IHOLD(ip);
 
 	xfs_itrace_entry(ip);
 	xfs_itrace_ref(ip);
@@ -2459,7 +2442,7 @@ xfs_remove(
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
 				dir_vp, DM_RIGHT_NULL,
 				NULL, DM_RIGHT_NULL,
-				name, NULL, dm_di_mode, error, 0);
+				name, NULL, ip->i_d.di_mode, error, 0);
 	}
 	return error;
 
@@ -2868,14 +2851,13 @@ xfs_rmdir(
 	char			*name = VNAME(dentry);
 	int			namelen = VNAMELEN(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
-  	xfs_inode_t             *cdp;   /* child directory */
+  	xfs_inode_t             *cdp = VNAME_TO_INODE(dentry);
 	xfs_trans_t             *tp;
 	int                     error;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
 	int			cancel_flags;
 	int			committed;
-	int			dm_di_mode = S_IFDIR;
 	int			last_cdp_link;
 	uint			resblks;
 
@@ -2884,24 +2866,15 @@ xfs_rmdir(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (!xfs_get_dir_entry(dentry, &cdp)) {
-	        dm_di_mode = cdp->i_d.di_mode;
-		IRELE(cdp);
-	}
-
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
 					dir_vp, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL,
-					name, NULL, dm_di_mode, 0, 0);
+					name, NULL, cdp->i_d.di_mode, 0, 0);
 		if (error)
 			return XFS_ERROR(error);
 	}
 
-	/* Return through std_return after this point. */
-
-	cdp = NULL;
-
 	/*
 	 * We need to get a reference to cdp before we get our log
 	 * reservation.  The reason for this is that we cannot call
@@ -2914,13 +2887,7 @@ xfs_rmdir(
 	 * when we call xfs_iget.  Instead we get an unlocked reference
 	 * to the inode before getting our log reservation.
 	 */
-	error = xfs_get_dir_entry(dentry, &cdp);
-	if (error) {
-		REMOVE_DEBUG_TRACE(__LINE__);
-		goto std_return;
-	}
-	mp = dp->i_mount;
-	dm_di_mode = cdp->i_d.di_mode;
+	IHOLD(cdp);
 
 	/*
 	 * Get the dquots for the inodes.
@@ -3077,7 +3044,7 @@ xfs_rmdir(
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
 					dir_vp, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL,
-					name, NULL, dm_di_mode,
+					name, NULL, cdp->i_d.di_mode,
 					error, 0);
 	}
 	return error;
-- 
cgit v0.10.2


From 126468b1156211e26d97f74b2f1767acd141005a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:44:57 +1100
Subject: [XFS] kill xfs_rwlock/xfs_rwunlock

We can just use xfs_ilock/xfs_iunlock instead and get rid of the ugly
bhv_vrwlock_t.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30533a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index e051952..169e6c0 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1532,9 +1532,9 @@ xfs_vm_bmap(
 	struct xfs_inode	*ip = XFS_I(inode);
 
 	xfs_itrace_entry(XFS_I(inode));
-	xfs_rwlock(ip, VRWLOCK_READ);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
-	xfs_rwunlock(ip, VRWLOCK_READ);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 	return generic_block_bmap(mapping, block, xfs_get_blocks);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 1663533..3c20007 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -228,11 +228,11 @@ xfs_read(
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
 	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-		bhv_vrwlock_t locktype = VRWLOCK_READ;
 		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
+		int iolock = XFS_IOLOCK_SHARED;
 
 		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *offset, size,
-					dmflags, &locktype);
+					dmflags, &iolock);
 		if (ret) {
 			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 			if (unlikely(ioflags & IO_ISDIRECT))
@@ -287,11 +287,11 @@ xfs_splice_read(
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
 	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-		bhv_vrwlock_t locktype = VRWLOCK_READ;
+		int iolock = XFS_IOLOCK_SHARED;
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *ppos, count,
-					FILP_DELAY_FLAG(infilp), &locktype);
+					FILP_DELAY_FLAG(infilp), &iolock);
 		if (error) {
 			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 			return -error;
@@ -330,11 +330,11 @@ xfs_splice_write(
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
 	if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
-		bhv_vrwlock_t locktype = VRWLOCK_WRITE;
+		int iolock = XFS_IOLOCK_EXCL;
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, *ppos, count,
-					FILP_DELAY_FLAG(outfilp), &locktype);
+					FILP_DELAY_FLAG(outfilp), &iolock);
 		if (error) {
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 			return -error;
@@ -580,7 +580,6 @@ xfs_write(
 	xfs_fsize_t		isize, new_size;
 	int			iolock;
 	int			eventsent = 0;
-	bhv_vrwlock_t		locktype;
 	size_t			ocount = 0, count;
 	loff_t			pos;
 	int			need_i_mutex;
@@ -607,11 +606,9 @@ xfs_write(
 relock:
 	if (ioflags & IO_ISDIRECT) {
 		iolock = XFS_IOLOCK_SHARED;
-		locktype = VRWLOCK_WRITE_DIRECT;
 		need_i_mutex = 0;
 	} else {
 		iolock = XFS_IOLOCK_EXCL;
-		locktype = VRWLOCK_WRITE;
 		need_i_mutex = 1;
 		mutex_lock(&inode->i_mutex);
 	}
@@ -635,8 +632,7 @@ start:
 
 		xfs_iunlock(xip, XFS_ILOCK_EXCL);
 		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
-				      pos, count,
-				      dmflags, &locktype);
+				      pos, count, dmflags, &iolock);
 		if (error) {
 			goto out_unlock_internal;
 		}
@@ -667,7 +663,6 @@ start:
 		if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
 			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 			iolock = XFS_IOLOCK_EXCL;
-			locktype = VRWLOCK_WRITE;
 			need_i_mutex = 1;
 			mutex_lock(&inode->i_mutex);
 			xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
@@ -744,7 +739,6 @@ retry:
 			mutex_unlock(&inode->i_mutex);
 
 			iolock = XFS_IOLOCK_SHARED;
-			locktype = VRWLOCK_WRITE_DIRECT;
 			need_i_mutex = 0;
 		}
 
@@ -781,7 +775,7 @@ retry:
 
 	if (ret == -ENOSPC &&
 	    DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
-		xfs_rwunlock(xip, locktype);
+		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
 		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
@@ -789,7 +783,7 @@ retry:
 				0, 0, 0); /* Delay flag intentionally  unused */
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
-		xfs_rwlock(xip, locktype);
+		xfs_ilock(xip, iolock);
 		if (error)
 			goto out_unlock_internal;
 		pos = xip->i_size;
@@ -817,7 +811,8 @@ retry:
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
 		int error2;
-		xfs_rwunlock(xip, locktype);
+
+		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
 		error2 = sync_page_range(inode, mapping, pos, ret);
@@ -825,7 +820,7 @@ retry:
 			error = error2;
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
-		xfs_rwlock(xip, locktype);
+		xfs_ilock(xip, iolock);
 		error2 = xfs_write_sync_logforce(mp, xip);
 		if (!error)
 			error = error2;
@@ -846,7 +841,7 @@ retry:
 			xip->i_d.di_size = xip->i_size;
 		xfs_iunlock(xip, XFS_ILOCK_EXCL);
 	}
-	xfs_rwunlock(xip, locktype);
+	xfs_iunlock(xip, iolock);
  out_unlock_mutex:
 	if (need_i_mutex)
 		mutex_unlock(&inode->i_mutex);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 2022318..4ed5914 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -46,18 +46,6 @@ static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
 }
 
 /*
- * Values for the vop_rwlock/rwunlock flags parameter.
- */
-typedef enum bhv_vrwlock {
-	VRWLOCK_NONE,
-	VRWLOCK_READ,
-	VRWLOCK_WRITE,
-	VRWLOCK_WRITE_DIRECT,
-	VRWLOCK_TRY_READ,
-	VRWLOCK_TRY_WRITE
-} bhv_vrwlock_t;
-
-/*
  * Return values for xfs_inactive.  A return value of
  * VN_INACTIVE_NOCACHE implies that the file system behavior
  * has disassociated its state and bhv_desc_t from the vnode.
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d8a472..110ee83 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -67,7 +67,7 @@ struct xfs_mru_cache;
  */
 
 typedef int	(*xfs_send_data_t)(int, bhv_vnode_t *,
-			xfs_off_t, size_t, int, bhv_vrwlock_t *);
+			xfs_off_t, size_t, int, int *);
 typedef int	(*xfs_send_mmap_t)(struct vm_area_struct *, uint);
 typedef int	(*xfs_send_destroy_t)(bhv_vnode_t *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4765e7c..811ee87 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3375,47 +3375,6 @@ std_return:
 }
 
 int
-xfs_rwlock(
-	xfs_inode_t	*ip,
-	bhv_vrwlock_t	locktype)
-{
-	if (S_ISDIR(ip->i_d.di_mode))
-		return 1;
-	if (locktype == VRWLOCK_WRITE) {
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-	} else if (locktype == VRWLOCK_TRY_READ) {
-		return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
-	} else if (locktype == VRWLOCK_TRY_WRITE) {
-		return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
-	} else {
-		ASSERT((locktype == VRWLOCK_READ) ||
-		       (locktype == VRWLOCK_WRITE_DIRECT));
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-
-	return 1;
-}
-
-
-void
-xfs_rwunlock(
-	xfs_inode_t     *ip,
-	bhv_vrwlock_t	locktype)
-{
- 	if (S_ISDIR(ip->i_d.di_mode))
-  		return;
-	if (locktype == VRWLOCK_WRITE) {
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	} else {
-		ASSERT((locktype == VRWLOCK_READ) ||
-		       (locktype == VRWLOCK_WRITE_DIRECT));
-		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-	}
-	return;
-}
-
-
-int
 xfs_inode_flush(
 	xfs_inode_t	*ip,
 	int		flags)
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 4e3970f0..85340ba 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -38,8 +38,6 @@ int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 int xfs_symlink(struct xfs_inode *dp, bhv_vname_t *dentry,
 		char *target_path, mode_t mode, bhv_vnode_t **vpp,
 		struct cred *credp);
-int xfs_rwlock(struct xfs_inode *ip, bhv_vrwlock_t locktype);
-void xfs_rwunlock(struct xfs_inode *ip, bhv_vrwlock_t locktype);
 int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_reclaim(struct xfs_inode *ip);
-- 
cgit v0.10.2


From 24bd861d1c3fff5248de7ba3bdddb3369087ad46 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:45:16 +1100
Subject: [XFS] don't encode parent in nfs filehandles unless necessary

As Dave pointed out, after the export ops changes we now always encode the
parent into the filehandle for regular files, but it's not actually needed
when the filesystem is exported with no_subtree_check. This one-liner fixes
xfs_fs_encode_fh to skip encoding the parent unless necessary.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30535a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index ca4f66c..21f0e82 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -66,7 +66,7 @@ xfs_fs_encode_fh(
 	int			len;
 
 	/* Directories don't need their parent encoded, they have ".." */
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode) || !connectable)
 		fileid_type = FILEID_INO32_GEN;
 	else
 		fileid_type = FILEID_INO32_GEN_PARENT;
-- 
cgit v0.10.2


From 44d814ced4cffbfe6a775c5bb8b941a6e734e7d9 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:45:29 +1100
Subject: [XFS] Update c/mtime correctly on truncates

XFS changes the c/mtime of an inode when truncating it to the same size.
The c/mtime is only supposed to change if the size is changed. Not to be
confused with ftruncate, where the c/mtime is supposed to be changed even
if the size is not changed.

The Linux VFS encodes this semantic difference in the flags it sends down
to ->setattr, which XFS currently ignores. We need to make XFS pay
attention to the VFS flags and hence Do The Right Thing.

SGI-PV: 977547
SGI-Modid: xfs-linux-melb:xfs-kern:30536a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 811ee87..b77dede 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -633,6 +633,15 @@ xfs_setattr(
 	 * Truncate file.  Must have write permission and not be a directory.
 	 */
 	if (mask & XFS_AT_SIZE) {
+		/*
+		 * Only change the c/mtime if we are changing the size
+		 * or we are explicitly asked to change it. This handles
+		 * the semantic difference between truncate() and ftruncate()
+		 * as implemented in the VFS.
+		 */
+		if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME))
+			timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+
 		if (vap->va_size > ip->i_size) {
 			xfs_igrow_finish(tp, ip, vap->va_size,
 			    !(flags & ATTR_DMI));
@@ -661,10 +670,6 @@ xfs_setattr(
 			 */
 			xfs_iflags_set(ip, XFS_ITRUNCATED);
 		}
-		/*
-		 * Have to do this even if the file's size doesn't change.
-		 */
-		timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 	}
 
 	/*
-- 
cgit v0.10.2


From 6ee4752ffe782be6e86bea1403a2fe0f682aa71a Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:45:35 +1100
Subject: [XFS] Use atomic counters for ktrace buffer indexes

ktrace_enter() is consuming vast amounts of CPU time due to the use of a
single global lock for protecting buffer index increments. Change it to
use per-buffer atomic counters - this reduces ktrace_enter() overhead
during a trace intensive test on a 4p machine from 58% of all CPU time to
12% and halves test runtime.

SGI-PV: 977546
SGI-Modid: xfs-linux-melb:xfs-kern:30537a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 129067c..4e0444c 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -92,7 +92,7 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
 
 	ktp->kt_entries  = ktep;
 	ktp->kt_nentries = nentries;
-	ktp->kt_index    = 0;
+	atomic_set(&ktp->kt_index, 0);
 	ktp->kt_rollover = 0;
 	return ktp;
 }
@@ -151,8 +151,6 @@ ktrace_enter(
 	void            *val14,
 	void            *val15)
 {
-	static DEFINE_SPINLOCK(wrap_lock);
-	unsigned long	flags;
 	int             index;
 	ktrace_entry_t  *ktep;
 
@@ -161,12 +159,8 @@ ktrace_enter(
 	/*
 	 * Grab an entry by pushing the index up to the next one.
 	 */
-	spin_lock_irqsave(&wrap_lock, flags);
-	index = ktp->kt_index;
-	if (++ktp->kt_index == ktp->kt_nentries)
-		ktp->kt_index = 0;
-	spin_unlock_irqrestore(&wrap_lock, flags);
-
+	index = atomic_add_return(1, &ktp->kt_index);
+	index = (index - 1) % ktp->kt_nentries;
 	if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
 		ktp->kt_rollover = 1;
 
@@ -199,11 +193,12 @@ int
 ktrace_nentries(
 	ktrace_t        *ktp)
 {
-	if (ktp == NULL) {
+	int	index;
+	if (ktp == NULL)
 		return 0;
-	}
 
-	return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index);
+	index = atomic_read(&ktp->kt_index) % ktp->kt_nentries;
+	return (ktp->kt_rollover ? ktp->kt_nentries : index);
 }
 
 /*
@@ -228,7 +223,7 @@ ktrace_first(ktrace_t   *ktp, ktrace_snap_t     *ktsp)
 	int             nentries;
 
 	if (ktp->kt_rollover)
-		index = ktp->kt_index;
+		index = atomic_read(&ktp->kt_index) % ktp->kt_nentries;
 	else
 		index = 0;
 
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
index 56e72b4..782dbbb 100644
--- a/fs/xfs/support/ktrace.h
+++ b/fs/xfs/support/ktrace.h
@@ -30,7 +30,7 @@ typedef struct ktrace_entry {
  */
 typedef struct ktrace {
 	int		kt_nentries;	/* number of entries in trace buf */
-	int		kt_index;	/* current index in entries */
+	atomic_t	kt_index;	/* current index in entries */
 	int		kt_rollover;
 	ktrace_entry_t	*kt_entries;	/* buffer of entries */
 } ktrace_t;
-- 
cgit v0.10.2


From d234154125197053d5215711b5df867979e55ebd Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:45:43 +1100
Subject: [XFS] Use power-of-2 sized buffers to reduce overhead

Now that the ktrace_enter() code is using atomics, the non-power-of-2
buffer sizes - which require modulus operations to get the index - are
showing up as using substantial CPU in the profiles.

Force the buffer sizes to be rounded up to the nearest power of two and
use masking rather than modulus operations to convert the index counter to
the buffer index. This reduces ktrace_enter overhead to 8% of CPU time,
and again almost halves the trace intensive test runtime.

SGI-PV: 977546
SGI-Modid: xfs-linux-melb:xfs-kern:30538a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 4e0444c..0b75d30 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -24,7 +24,7 @@ static int          ktrace_zentries;
 void __init
 ktrace_init(int zentries)
 {
-	ktrace_zentries = zentries;
+	ktrace_zentries = roundup_pow_of_two(zentries);
 
 	ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
 					"ktrace_hdr");
@@ -47,13 +47,16 @@ ktrace_uninit(void)
  * ktrace_alloc()
  *
  * Allocate a ktrace header and enough buffering for the given
- * number of entries.
+ * number of entries. Round the number of entries up to a
+ * power of 2 so we can do fast masking to get the index from
+ * the atomic index counter.
  */
 ktrace_t *
 ktrace_alloc(int nentries, unsigned int __nocast sleep)
 {
 	ktrace_t        *ktp;
 	ktrace_entry_t  *ktep;
+	int		entries;
 
 	ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
 
@@ -70,11 +73,12 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
 	/*
 	 * Special treatment for buffers with the ktrace_zentries entries
 	 */
-	if (nentries == ktrace_zentries) {
+	entries = roundup_pow_of_two(nentries);
+	if (entries == ktrace_zentries) {
 		ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
 							    sleep);
 	} else {
-		ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
+		ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)),
 							    sleep | KM_LARGE);
 	}
 
@@ -91,7 +95,9 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
 	}
 
 	ktp->kt_entries  = ktep;
-	ktp->kt_nentries = nentries;
+	ktp->kt_nentries = entries;
+	ASSERT(is_power_of_2(entries));
+	ktp->kt_index_mask = entries - 1;
 	atomic_set(&ktp->kt_index, 0);
 	ktp->kt_rollover = 0;
 	return ktp;
@@ -160,7 +166,7 @@ ktrace_enter(
 	 * Grab an entry by pushing the index up to the next one.
 	 */
 	index = atomic_add_return(1, &ktp->kt_index);
-	index = (index - 1) % ktp->kt_nentries;
+	index = (index - 1) & ktp->kt_index_mask;
 	if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
 		ktp->kt_rollover = 1;
 
@@ -197,7 +203,7 @@ ktrace_nentries(
 	if (ktp == NULL)
 		return 0;
 
-	index = atomic_read(&ktp->kt_index) % ktp->kt_nentries;
+	index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
 	return (ktp->kt_rollover ? ktp->kt_nentries : index);
 }
 
@@ -223,7 +229,7 @@ ktrace_first(ktrace_t   *ktp, ktrace_snap_t     *ktsp)
 	int             nentries;
 
 	if (ktp->kt_rollover)
-		index = atomic_read(&ktp->kt_index) % ktp->kt_nentries;
+		index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
 	else
 		index = 0;
 
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
index 782dbbb..741d694 100644
--- a/fs/xfs/support/ktrace.h
+++ b/fs/xfs/support/ktrace.h
@@ -31,6 +31,7 @@ typedef struct ktrace_entry {
 typedef struct ktrace {
 	int		kt_nentries;	/* number of entries in trace buf */
 	atomic_t	kt_index;	/* current index in entries */
+	unsigned int	kt_index_mask;
 	int		kt_rollover;
 	ktrace_entry_t	*kt_entries;	/* buffer of entries */
 } ktrace_t;
-- 
cgit v0.10.2


From bc4ac74a4e5bd7db02976eb1b681e1d11f81c9ce Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:45:58 +1100
Subject: [XFS] cleanup vnode use in dmapi calls

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30545a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index edab1ff..0590524 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -469,16 +469,11 @@ xfs_file_open_exec(
 	struct inode	*inode)
 {
 	struct xfs_mount *mp = XFS_M(inode->i_sb);
+	struct xfs_inode *ip = XFS_I(inode);
 
-	if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI)) {
-		if (DM_EVENT_ENABLED(XFS_I(inode), DM_EVENT_READ)) {
-			bhv_vnode_t *vp = vn_from_inode(inode);
-
-			return -XFS_SEND_DATA(mp, DM_EVENT_READ,
-						vp, 0, 0, 0, NULL);
-		}
-	}
-
+	if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI) &&
+	             DM_EVENT_ENABLED(ip, DM_EVENT_READ))
+		return -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
 	return 0;
 }
 #endif /* HAVE_FOP_OPEN_EXEC */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 3c20007..01a8f26 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -231,7 +231,7 @@ xfs_read(
 		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
 		int iolock = XFS_IOLOCK_SHARED;
 
-		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *offset, size,
+		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
 					dmflags, &iolock);
 		if (ret) {
 			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -276,7 +276,6 @@ xfs_splice_read(
 	int			flags,
 	int			ioflags)
 {
-	bhv_vnode_t		*vp = XFS_ITOV(ip);
 	xfs_mount_t		*mp = ip->i_mount;
 	ssize_t			ret;
 
@@ -290,7 +289,7 @@ xfs_splice_read(
 		int iolock = XFS_IOLOCK_SHARED;
 		int error;
 
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *ppos, count,
+		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
 					FILP_DELAY_FLAG(infilp), &iolock);
 		if (error) {
 			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -317,7 +316,6 @@ xfs_splice_write(
 	int			flags,
 	int			ioflags)
 {
-	bhv_vnode_t		*vp = XFS_ITOV(ip);
 	xfs_mount_t		*mp = ip->i_mount;
 	ssize_t			ret;
 	struct inode		*inode = outfilp->f_mapping->host;
@@ -333,7 +331,7 @@ xfs_splice_write(
 		int iolock = XFS_IOLOCK_EXCL;
 		int error;
 
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, *ppos, count,
+		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
 					FILP_DELAY_FLAG(outfilp), &iolock);
 		if (error) {
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -631,7 +629,7 @@ start:
 			dmflags |= DM_FLAGS_IMUX;
 
 		xfs_iunlock(xip, XFS_ILOCK_EXCL);
-		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
+		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
 				      pos, count, dmflags, &iolock);
 		if (error) {
 			goto out_unlock_internal;
@@ -778,8 +776,8 @@ retry:
 		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
-		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
-				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
+		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
+				DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
 				0, 0, 0); /* Delay flag intentionally  unused */
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 87f6467..19aae13 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5811,7 +5811,7 @@ xfs_getbmap(
 	if ((interface & BMV_IF_NO_DMAPI_READ) == 0 &&
 	    DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
 	    whichfork == XFS_DATA_FORK) {
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL);
+		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
 		if (error)
 			return XFS_ERROR(error);
 	}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 110ee83..7b37fa0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -66,17 +66,17 @@ struct xfs_mru_cache;
  * Prototypes and functions for the Data Migration subsystem.
  */
 
-typedef int	(*xfs_send_data_t)(int, bhv_vnode_t *,
+typedef int	(*xfs_send_data_t)(int, struct xfs_inode *,
 			xfs_off_t, size_t, int, int *);
 typedef int	(*xfs_send_mmap_t)(struct vm_area_struct *, uint);
-typedef int	(*xfs_send_destroy_t)(bhv_vnode_t *, dm_right_t);
+typedef int	(*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
-			bhv_vnode_t *,
-			dm_right_t, bhv_vnode_t *, dm_right_t,
+			struct xfs_inode *, dm_right_t,
+			struct xfs_inode *, dm_right_t,
 			char *, char *, mode_t, int, int);
 typedef int	(*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
 			char *, char *);
-typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, bhv_vnode_t *,
+typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
 			dm_right_t, mode_t, int, int);
 
 typedef struct xfs_dmops {
@@ -88,20 +88,20 @@ typedef struct xfs_dmops {
 	xfs_send_unmount_t	xfs_send_unmount;
 } xfs_dmops_t;
 
-#define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \
-	(*(mp)->m_dm_ops->xfs_send_data)(ev,vp,off,len,fl,lock)
+#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
+	(*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
 #define XFS_SEND_MMAP(mp, vma,fl) \
 	(*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
-#define XFS_SEND_DESTROY(mp, vp,right) \
-	(*(mp)->m_dm_ops->xfs_send_destroy)(vp,right)
+#define XFS_SEND_DESTROY(mp, ip,right) \
+	(*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
 #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
 	(*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
 #define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
 	(*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl)
 #define XFS_SEND_MOUNT(mp,right,path,name) \
 	(*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
-#define XFS_SEND_UNMOUNT(mp, vp,right,mode,rval,fl) \
-	(*(mp)->m_dm_ops->xfs_send_unmount)(mp,vp,right,mode,rval,fl)
+#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \
+	(*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
 
 
 /*
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index fd1244c..6f80cfd 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -258,8 +258,8 @@ xfs_rename(
 	if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
 	    DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
-					src_dir_vp, DM_RIGHT_NULL,
-					target_dir_vp, DM_RIGHT_NULL,
+					src_dp, DM_RIGHT_NULL,
+					target_dp, DM_RIGHT_NULL,
 					src_name, target_name,
 					0, 0, 0);
 		if (error) {
@@ -591,8 +591,8 @@ std_return:
 	if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
 	    DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
 		(void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
-					src_dir_vp, DM_RIGHT_NULL,
-					target_dir_vp, DM_RIGHT_NULL,
+					src_dp, DM_RIGHT_NULL,
+					target_dp, DM_RIGHT_NULL,
 					src_name, target_name,
 					0, error, 0);
 	}
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 3ec27bf..4c132a8 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -566,7 +566,7 @@ xfs_unmount(
 #ifdef HAVE_DMAPI
 	if (mp->m_flags & XFS_MOUNT_DMAPI) {
 		error = XFS_SEND_PREUNMOUNT(mp,
-				rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL,
+				rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
 				NULL, NULL, 0, 0,
 				(mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))?
 					0:DM_FLAGS_UNWANTED);
@@ -617,7 +617,7 @@ out:
 		/* Note: mp structure must still exist for
 		 * XFS_SEND_UNMOUNT() call.
 		 */
-		XFS_SEND_UNMOUNT(mp, error == 0 ? rvp : NULL,
+		XFS_SEND_UNMOUNT(mp, error == 0 ? rip : NULL,
 			DM_RIGHT_NULL, 0, error, unmount_event_flags);
 	}
 	if (xfs_unmountfs_needed) {
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index b77dede..7e124b5 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -326,7 +326,7 @@ xfs_setattr(
 		if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
 		    !(flags & ATTR_DMI)) {
 			int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
-			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
+			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
 				vap->va_size, 0, dmflags, NULL);
 			if (code) {
 				lock_flags = 0;
@@ -881,7 +881,7 @@ xfs_setattr(
 
 	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
 	    !(flags & ATTR_DMI)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
+		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL, NULL, NULL,
 					0, 0, AT_DELAY_FLAG(flags));
 	}
@@ -1586,9 +1586,8 @@ xfs_inactive(
 
 	mp = ip->i_mount;
 
-	if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) {
-		(void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
-	}
+	if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
+		XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
 
 	error = 0;
 
@@ -1820,7 +1819,7 @@ xfs_create(
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-				dir_vp, DM_RIGHT_NULL, NULL,
+				dp, DM_RIGHT_NULL, NULL,
 				DM_RIGHT_NULL, name, NULL,
 				mode, 0, 0);
 
@@ -1976,8 +1975,8 @@ std_return:
 	if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
 	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-			dir_vp, DM_RIGHT_NULL,
-			*vpp ? vp:NULL,
+			dp, DM_RIGHT_NULL,
+			*vpp ? ip : NULL,
 			DM_RIGHT_NULL, name, NULL,
 			mode, error, 0);
 	}
@@ -2272,7 +2271,6 @@ xfs_remove(
 	xfs_inode_t             *dp,
 	bhv_vname_t		*dentry)
 {
-	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
 	char			*name = VNAME(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_inode_t             *ip = VNAME_TO_INODE(dentry);
@@ -2292,7 +2290,7 @@ xfs_remove(
 		return XFS_ERROR(EIO);
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
+		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
 					name, NULL, ip->i_d.di_mode, 0, 0);
 		if (error)
@@ -2445,7 +2443,7 @@ xfs_remove(
  std_return:
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
-				dir_vp, DM_RIGHT_NULL,
+				dp, DM_RIGHT_NULL,
 				NULL, DM_RIGHT_NULL,
 				name, NULL, ip->i_d.di_mode, error, 0);
 	}
@@ -2504,8 +2502,8 @@ xfs_link(
 
 	if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
-					target_dir_vp, DM_RIGHT_NULL,
-					src_vp, DM_RIGHT_NULL,
+					tdp, DM_RIGHT_NULL,
+					sip, DM_RIGHT_NULL,
 					target_name, NULL, 0, 0, 0);
 		if (error)
 			return error;
@@ -2615,8 +2613,8 @@ xfs_link(
 std_return:
 	if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
-				target_dir_vp, DM_RIGHT_NULL,
-				src_vp, DM_RIGHT_NULL,
+				tdp, DM_RIGHT_NULL,
+				sip, DM_RIGHT_NULL,
 				target_name, NULL, 0, error, 0);
 	}
 	return error;
@@ -2665,7 +2663,7 @@ xfs_mkdir(
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-					dir_vp, DM_RIGHT_NULL, NULL,
+					dp, DM_RIGHT_NULL, NULL,
 					DM_RIGHT_NULL, dir_name, NULL,
 					mode, 0, 0);
 		if (error)
@@ -2823,8 +2821,8 @@ std_return:
 	if ((created || (error != 0 && dm_event_sent != 0)) &&
 	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-					dir_vp, DM_RIGHT_NULL,
-					created ? XFS_ITOV(cdp):NULL,
+					dp, DM_RIGHT_NULL,
+					created ? cdp : NULL,
 					DM_RIGHT_NULL,
 					dir_name, NULL,
 					mode, error, 0);
@@ -2873,7 +2871,7 @@ xfs_rmdir(
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
-					dir_vp, DM_RIGHT_NULL,
+					dp, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL,
 					name, NULL, cdp->i_d.di_mode, 0, 0);
 		if (error)
@@ -3047,7 +3045,7 @@ xfs_rmdir(
  std_return:
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
-					dir_vp, DM_RIGHT_NULL,
+					dp, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL,
 					name, NULL, cdp->i_d.di_mode,
 					error, 0);
@@ -3144,7 +3142,7 @@ xfs_symlink(
 	}
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
+		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
 					link_name, target_path, 0, 0, 0);
 		if (error)
@@ -3348,8 +3346,8 @@ xfs_symlink(
 std_return:
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
-					dir_vp, DM_RIGHT_NULL,
-					error ? NULL : XFS_ITOV(ip),
+					dp, DM_RIGHT_NULL,
+					error ? NULL : ip,
 					DM_RIGHT_NULL, link_name, target_path,
 					0, error, 0);
 	}
@@ -3707,9 +3705,8 @@ xfs_alloc_file_space(
 		end_dmi_offset = offset+len;
 		if (end_dmi_offset > ip->i_size)
 			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
-			offset, end_dmi_offset - offset,
-			0, NULL);
+		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
+				      end_dmi_offset - offset, 0, NULL);
 		if (error)
 			return error;
 	}
@@ -3818,8 +3815,8 @@ dmapi_enospc_check:
 	if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
 	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
-				XFS_ITOV(ip), DM_RIGHT_NULL,
-				XFS_ITOV(ip), DM_RIGHT_NULL,
+				ip, DM_RIGHT_NULL,
+				ip, DM_RIGHT_NULL,
 				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
 		if (error == 0)
 			goto retry;	/* Maybe DMAPI app. has made space */
@@ -3964,7 +3961,7 @@ xfs_free_file_space(
 	    DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
 		if (end_dmi_offset > ip->i_size)
 			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
+		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
 				offset, end_dmi_offset - offset,
 				AT_DELAY_FLAG(attr_flags), NULL);
 		if (error)
-- 
cgit v0.10.2


From 979ebab11623894528d4d37b947533ea4e8649d1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:05 +1100
Subject: [XFS] cleanup vnode use in xfs_create/mknod/mkdir

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30546a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 3467011..62899a1 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -273,7 +273,7 @@ xfs_vn_mknod(
 	dev_t		rdev)
 {
 	struct inode	*inode;
-	bhv_vnode_t	*vp = NULL, *dvp = vn_from_inode(dir);
+	struct xfs_inode *ip = NULL;
 	xfs_acl_t	*default_acl = NULL;
 	attrexists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
 	int		error;
@@ -285,11 +285,11 @@ xfs_vn_mknod(
 	if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
 		return -EINVAL;
 
-	if (test_default_acl && test_default_acl(dvp)) {
+	if (test_default_acl && test_default_acl(dir)) {
 		if (!_ACL_ALLOC(default_acl)) {
 			return -ENOMEM;
 		}
-		if (!_ACL_GET_DEFAULT(dvp, default_acl)) {
+		if (!_ACL_GET_DEFAULT(dir, default_acl)) {
 			_ACL_FREE(default_acl);
 			default_acl = NULL;
 		}
@@ -305,10 +305,10 @@ xfs_vn_mknod(
 	case S_IFSOCK:
 		rdev = sysv_encode_dev(rdev);
 	case S_IFREG:
-		error = xfs_create(XFS_I(dir), dentry, mode, rdev, &vp, NULL);
+		error = xfs_create(XFS_I(dir), dentry, mode, rdev, &ip, NULL);
 		break;
 	case S_IFDIR:
-		error = xfs_mkdir(XFS_I(dir), dentry, mode, &vp, NULL);
+		error = xfs_mkdir(XFS_I(dir), dentry, mode, &ip, NULL);
 		break;
 	default:
 		error = EINVAL;
@@ -318,19 +318,20 @@ xfs_vn_mknod(
 	if (unlikely(error))
 		goto out_free_acl;
 
-	error = xfs_init_security(vp, dir);
+	inode = ip->i_vnode;
+
+	error = xfs_init_security(inode, dir);
 	if (unlikely(error))
 		goto out_cleanup_inode;
 
 	if (default_acl) {
-		error = _ACL_INHERIT(vp, mode, default_acl);
+		error = _ACL_INHERIT(inode, mode, default_acl);
 		if (unlikely(error))
 			goto out_cleanup_inode;
-		xfs_iflags_set(XFS_I(vp), XFS_IMODIFIED);
+		xfs_iflags_set(ip, XFS_IMODIFIED);
 		_ACL_FREE(default_acl);
 	}
 
-	inode = vn_to_inode(vp);
 
 	if (S_ISDIR(mode))
 		xfs_validate_fields(inode);
@@ -339,7 +340,7 @@ xfs_vn_mknod(
 	return -error;
 
  out_cleanup_inode:
-	xfs_cleanup_inode(dir, vp, dentry, mode);
+	xfs_cleanup_inode(dir, inode, dentry, mode);
  out_free_acl:
 	if (default_acl)
 		_ACL_FREE(default_acl);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 7e124b5..a42d7fe 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1791,14 +1791,12 @@ xfs_create(
 	bhv_vname_t		*dentry,
 	mode_t			mode,
 	xfs_dev_t		rdev,
-	bhv_vnode_t		**vpp,
+	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
 	char			*name = VNAME(dentry);
 	xfs_mount_t	        *mp = dp->i_mount;
-	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
 	xfs_inode_t		*ip;
-	bhv_vnode_t	        *vp = NULL;
 	xfs_trans_t		*tp;
 	int                     error;
 	xfs_bmap_free_t		free_list;
@@ -1812,7 +1810,7 @@ xfs_create(
 	uint			resblks;
 	int			namelen;
 
-	ASSERT(!*vpp);
+	ASSERT(!*ipp);
 	xfs_itrace_entry(dp);
 
 	namelen = VNAMELEN(dentry);
@@ -1911,7 +1909,7 @@ xfs_create(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	VN_HOLD(dir_vp);
+	IHOLD(dp);
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
@@ -1949,7 +1947,6 @@ xfs_create(
 	 * vnode to the caller, we bump the vnode ref count now.
 	 */
 	IHOLD(ip);
-	vp = XFS_ITOV(ip);
 
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
 	if (error) {
@@ -1967,16 +1964,16 @@ xfs_create(
 	XFS_QM_DQRELE(mp, udqp);
 	XFS_QM_DQRELE(mp, gdqp);
 
-	*vpp = vp;
+	*ipp = ip;
 
 	/* Fallthrough to std_return with error = 0  */
 
 std_return:
-	if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
+	if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
 	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
 			dp, DM_RIGHT_NULL,
-			*vpp ? ip : NULL,
+			*ipp ? ip : NULL,
 			DM_RIGHT_NULL, name, NULL,
 			mode, error, 0);
 	}
@@ -2634,15 +2631,13 @@ xfs_mkdir(
 	xfs_inode_t             *dp,
 	bhv_vname_t		*dentry,
 	mode_t			mode,
-	bhv_vnode_t		**vpp,
+	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
-	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
 	char			*dir_name = VNAME(dentry);
 	int			dir_namelen = VNAMELEN(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_inode_t		*cdp;	/* inode of created dir */
-	bhv_vnode_t		*cvp;	/* vnode of created dir */
 	xfs_trans_t		*tp;
 	int			cancel_flags;
 	int			error;
@@ -2749,7 +2744,7 @@ xfs_mkdir(
 	 * from here on will result in the transaction cancel
 	 * unlocking dp so don't do it explicitly in the error path.
 	 */
-	VN_HOLD(dir_vp);
+	IHOLD(dp);
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
@@ -2780,11 +2775,9 @@ xfs_mkdir(
 	if (error)
 		goto error2;
 
-	cvp = XFS_ITOV(cdp);
-
 	created = B_TRUE;
 
-	*vpp = cvp;
+	*ipp = cdp;
 	IHOLD(cdp);
 
 	/*
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 85340ba..0acef12 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -26,12 +26,12 @@ int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry,
 		bhv_vnode_t **vpp);
 int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode,
-		xfs_dev_t rdev, bhv_vnode_t **vpp, struct cred *credp);
+		xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
 int xfs_remove(struct xfs_inode *dp, bhv_vname_t	*dentry);
 int xfs_link(struct xfs_inode *tdp, bhv_vnode_t *src_vp,
 		bhv_vname_t *dentry);
 int xfs_mkdir(struct xfs_inode *dp, bhv_vname_t *dentry,
-		mode_t mode, bhv_vnode_t **vpp, struct cred *credp);
+		mode_t mode, struct xfs_inode **ipp, struct cred *credp);
 int xfs_rmdir(struct xfs_inode *dp, bhv_vname_t *dentry);
 int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
-- 
cgit v0.10.2


From a3da789640871c897901c5f766e33be78d56f35a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:12 +1100
Subject: [XFS] cleanup vnode use in xfs_link

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30547a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 62899a1..1df4820 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -395,23 +395,22 @@ xfs_vn_link(
 	struct inode	*dir,
 	struct dentry	*dentry)
 {
-	struct inode	*ip;	/* inode of guy being linked to */
-	bhv_vnode_t	*vp;	/* vp of name being linked */
+	struct inode	*inode;	/* inode of guy being linked to */
 	int		error;
 
-	ip = old_dentry->d_inode;	/* inode being linked to */
-	vp = vn_from_inode(ip);
+	inode = old_dentry->d_inode;
 
-	VN_HOLD(vp);
-	error = xfs_link(XFS_I(dir), vp, dentry);
+	igrab(inode);
+	error = xfs_link(XFS_I(dir), XFS_I(inode), dentry);
 	if (unlikely(error)) {
-		VN_RELE(vp);
-	} else {
-		xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
-		xfs_validate_fields(ip);
-		d_instantiate(dentry, ip);
+		iput(inode);
+		return -error;
 	}
-	return -error;
+
+	xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
+	xfs_validate_fields(inode);
+	d_instantiate(dentry, inode);
+	return 0;
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index a42d7fe..10d2d22 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2471,12 +2471,10 @@ xfs_remove(
 int
 xfs_link(
 	xfs_inode_t		*tdp,
-	bhv_vnode_t		*src_vp,
+	xfs_inode_t		*sip,
 	bhv_vname_t		*dentry)
 {
-	bhv_vnode_t		*target_dir_vp = XFS_ITOV(tdp);
 	xfs_mount_t		*mp = tdp->i_mount;
-	xfs_inode_t		*sip = xfs_vtoi(src_vp);
 	xfs_trans_t		*tp;
 	xfs_inode_t		*ips[2];
 	int			error;
@@ -2489,10 +2487,10 @@ xfs_link(
 	int			target_namelen;
 
 	xfs_itrace_entry(tdp);
-	xfs_itrace_entry(xfs_vtoi(src_vp));
+	xfs_itrace_entry(sip);
 
 	target_namelen = VNAMELEN(dentry);
-	ASSERT(!VN_ISDIR(src_vp));
+	ASSERT(!S_ISDIR(sip->i_d.di_mode));
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -2544,8 +2542,8 @@ xfs_link(
 	 * xfs_trans_cancel will both unlock the inodes and
 	 * decrement the associated ref counts.
 	 */
-	VN_HOLD(src_vp);
-	VN_HOLD(target_dir_vp);
+	IHOLD(sip);
+	IHOLD(tdp);
 	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
 
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 0acef12..79c13f5 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -28,7 +28,7 @@ int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry,
 int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode,
 		xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
 int xfs_remove(struct xfs_inode *dp, bhv_vname_t	*dentry);
-int xfs_link(struct xfs_inode *tdp, bhv_vnode_t *src_vp,
+int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 		bhv_vname_t *dentry);
 int xfs_mkdir(struct xfs_inode *dp, bhv_vname_t *dentry,
 		mode_t mode, struct xfs_inode **ipp, struct cred *credp);
-- 
cgit v0.10.2


From 3937be5ba836a204d3d1df96b518eecd6cdacbb9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:19 +1100
Subject: [XFS] cleanup vnode use in xfs_symlink and xfs_rename

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30548a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 1df4820..215158c 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -437,29 +437,33 @@ xfs_vn_symlink(
 	struct dentry	*dentry,
 	const char	*symname)
 {
-	struct inode	*ip;
-	bhv_vnode_t	*cvp;	/* used to lookup symlink to put in dentry */
+	struct inode	*inode;
+	struct xfs_inode *cip = NULL;
 	int		error;
 	mode_t		mode;
 
-	cvp = NULL;
-
 	mode = S_IFLNK |
 		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
 
 	error = xfs_symlink(XFS_I(dir), dentry, (char *)symname, mode,
-			    &cvp, NULL);
-	if (likely(!error && cvp)) {
-		error = xfs_init_security(cvp, dir);
-		if (likely(!error)) {
-			ip = vn_to_inode(cvp);
-			d_instantiate(dentry, ip);
-			xfs_validate_fields(dir);
-			xfs_validate_fields(ip);
-		} else {
-			xfs_cleanup_inode(dir, cvp, dentry, 0);
-		}
-	}
+			    &cip, NULL);
+	if (unlikely(error))
+		goto out;
+
+	inode = cip->i_vnode;
+
+	error = xfs_init_security(inode, dir);
+	if (unlikely(error))
+		goto out_cleanup_inode;
+
+	d_instantiate(dentry, inode);
+	xfs_validate_fields(dir);
+	xfs_validate_fields(inode);
+	return 0;
+
+ out_cleanup_inode:
+	xfs_cleanup_inode(dir, inode, dentry, 0);
+ out:
 	return -error;
 }
 
@@ -487,12 +491,9 @@ xfs_vn_rename(
 	struct dentry	*ndentry)
 {
 	struct inode	*new_inode = ndentry->d_inode;
-	bhv_vnode_t	*tvp;	/* target directory */
 	int		error;
 
-	tvp = vn_from_inode(ndir);
-
-	error = xfs_rename(XFS_I(odir), odentry, tvp, ndentry);
+	error = xfs_rename(XFS_I(odir), odentry, XFS_I(ndir), ndentry);
 	if (likely(!error)) {
 		if (new_inode)
 			xfs_validate_fields(new_inode);
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 6f80cfd..c4d0bac 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -219,12 +219,11 @@ int
 xfs_rename(
 	xfs_inode_t	*src_dp,
 	bhv_vname_t	*src_vname,
-	bhv_vnode_t	*target_dir_vp,
+	xfs_inode_t	*target_dp,
 	bhv_vname_t	*target_vname)
 {
-	bhv_vnode_t	*src_dir_vp = XFS_ITOV(src_dp);
 	xfs_trans_t	*tp;
-	xfs_inode_t	*target_dp, *src_ip, *target_ip;
+	xfs_inode_t	*src_ip, *target_ip;
 	xfs_mount_t	*mp = src_dp->i_mount;
 	int		new_parent;		/* moving to a new dir */
 	int		src_is_directory;	/* src_name is a directory */
@@ -244,16 +243,7 @@ xfs_rename(
 	int		target_namelen = VNAMELEN(target_vname);
 
 	xfs_itrace_entry(src_dp);
-	xfs_itrace_entry(xfs_vtoi(target_dir_vp));
-
-	/*
-	 * Find the XFS behavior descriptor for the target directory
-	 * vnode since it was not handed to us.
-	 */
-	target_dp = xfs_vtoi(target_dir_vp);
-	if (target_dp == NULL) {
-		return XFS_ERROR(EXDEV);
-	}
+	xfs_itrace_entry(target_dp);
 
 	if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
 	    DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
@@ -360,10 +350,10 @@ xfs_rename(
 	 * them when they unlock the inodes.  Also, we need to be careful
 	 * not to add an inode to the transaction more than once.
 	 */
-	VN_HOLD(src_dir_vp);
+	IHOLD(src_dp);
 	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
 	if (new_parent) {
-		VN_HOLD(target_dir_vp);
+		IHOLD(target_dp);
 		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
 	}
 	if ((src_ip != src_dp) && (src_ip != target_dp)) {
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 10d2d22..fa694dc 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3059,10 +3059,9 @@ xfs_symlink(
 	bhv_vname_t		*dentry,
 	char			*target_path,
 	mode_t			mode,
-	bhv_vnode_t		**vpp,
+	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
-	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_trans_t		*tp;
 	xfs_inode_t		*ip;
@@ -3088,7 +3087,7 @@ xfs_symlink(
 	char			*link_name = VNAME(dentry);
 	int			link_namelen;
 
-	*vpp = NULL;
+	*ipp = NULL;
 	error = 0;
 	ip = NULL;
 	tp = NULL;
@@ -3227,7 +3226,7 @@ xfs_symlink(
 	 * transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	VN_HOLD(dir_vp);
+	IHOLD(dp);
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
@@ -3343,13 +3342,8 @@ std_return:
 					0, error, 0);
 	}
 
-	if (!error) {
-		bhv_vnode_t *vp;
-
-		ASSERT(ip);
-		vp = XFS_ITOV(ip);
-		*vpp = vp;
-	}
+	if (!error)
+		*ipp = ip;
 	return error;
 
  error2:
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 79c13f5..71e9b15 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -36,7 +36,7 @@ int xfs_rmdir(struct xfs_inode *dp, bhv_vname_t *dentry);
 int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, bhv_vname_t *dentry,
-		char *target_path, mode_t mode, bhv_vnode_t **vpp,
+		char *target_path, mode_t mode, struct xfs_inode **ipp,
 		struct cred *credp);
 int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
@@ -45,7 +45,7 @@ int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 		xfs_flock64_t *bf, xfs_off_t offset,
 		struct cred *credp, int	attr_flags);
 int xfs_rename(struct xfs_inode *src_dp, bhv_vname_t *src_vname,
-		bhv_vnode_t *target_dir_vp, bhv_vname_t *target_vname);
+		struct xfs_inode *target_dp, bhv_vname_t *target_vname);
 int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value,
 		int *valuelenp, int flags, cred_t *cred);
 int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
-- 
cgit v0.10.2


From ef1f5e7ad38e5414d016983a8cc5a8db7654a61d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:25 +1100
Subject: [XFS] cleanup vnode use in xfs_lookup

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30550a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 21f0e82..66a9a9e 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -213,17 +213,16 @@ xfs_fs_get_parent(
 	struct dentry		*child)
 {
 	int			error;
-	bhv_vnode_t		*cvp;
+	struct xfs_inode	*cip;
 	struct dentry		*parent;
 
-	cvp = NULL;
-	error = xfs_lookup(XFS_I(child->d_inode), &dotdot, &cvp);
+	error = xfs_lookup(XFS_I(child->d_inode), &dotdot, &cip);
 	if (unlikely(error))
 		return ERR_PTR(-error);
 
-	parent = d_alloc_anon(vn_to_inode(cvp));
+	parent = d_alloc_anon(cip->i_vnode);
 	if (unlikely(!parent)) {
-		VN_RELE(cvp);
+		iput(cip->i_vnode);
 		return ERR_PTR(-ENOMEM);
 	}
 	return parent;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 215158c..01d9b3f 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -372,13 +372,13 @@ xfs_vn_lookup(
 	struct dentry	*dentry,
 	struct nameidata *nd)
 {
-	bhv_vnode_t	*cvp;
+	struct xfs_inode *cip;
 	int		error;
 
 	if (dentry->d_name.len >= MAXNAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	error = xfs_lookup(XFS_I(dir), dentry, &cvp);
+	error = xfs_lookup(XFS_I(dir), dentry, &cip);
 	if (unlikely(error)) {
 		if (unlikely(error != ENOENT))
 			return ERR_PTR(-error);
@@ -386,7 +386,7 @@ xfs_vn_lookup(
 		return NULL;
 	}
 
-	return d_splice_alias(vn_to_inode(cvp), dentry);
+	return d_splice_alias(cip->i_vnode, dentry);
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index fa694dc..3418c94 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1763,7 +1763,7 @@ int
 xfs_lookup(
 	xfs_inode_t		*dp,
 	bhv_vname_t		*dentry,
-	bhv_vnode_t		**vpp)
+	xfs_inode_t		**ipp)
 {
 	xfs_inode_t		*ip;
 	xfs_ino_t		e_inum;
@@ -1778,7 +1778,7 @@ xfs_lookup(
 	lock_mode = xfs_ilock_map_shared(dp);
 	error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip);
 	if (!error) {
-		*vpp = XFS_ITOV(ip);
+		*ipp = ip;
 		xfs_itrace_ref(ip);
 	}
 	xfs_iunlock_map_shared(dp, lock_mode);
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 71e9b15..12e5818 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -24,7 +24,7 @@ int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
 int xfs_release(struct xfs_inode *ip);
 int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry,
-		bhv_vnode_t **vpp);
+		struct xfs_inode **ipp);
 int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode,
 		xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
 int xfs_remove(struct xfs_inode *dp, bhv_vname_t	*dentry);
-- 
cgit v0.10.2


From dcf49cc5cfbbc0070ad4307428f8282dc7e04e58 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:37 +1100
Subject: [XFS] cleanup vnode use in xfs_lrw.c

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30551a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 01a8f26..1d95dca 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -176,7 +176,6 @@ xfs_read(
 {
 	struct file		*file = iocb->ki_filp;
 	struct inode		*inode = file->f_mapping->host;
-	bhv_vnode_t		*vp = XFS_ITOV(ip);
 	xfs_mount_t		*mp = ip->i_mount;
 	size_t			size = 0;
 	ssize_t			ret = 0;
@@ -242,7 +241,7 @@ xfs_read(
 	}
 
 	if (unlikely(ioflags & IO_ISDIRECT)) {
-		if (VN_CACHED(vp))
+		if (inode->i_mapping->nrpages)
 			ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
 						    -1, FI_REMAPF_LOCKED);
 		mutex_unlock(&inode->i_mutex);
@@ -571,7 +570,6 @@ xfs_write(
 	struct file		*file = iocb->ki_filp;
 	struct address_space	*mapping = file->f_mapping;
 	struct inode		*inode = mapping->host;
-	bhv_vnode_t		*vp = XFS_ITOV(xip);
 	unsigned long		segs = nsegs;
 	xfs_mount_t		*mp;
 	ssize_t			ret = 0, error = 0;
@@ -658,7 +656,7 @@ start:
 			return XFS_ERROR(-EINVAL);
 		}
 
-		if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
+		if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
 			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 			iolock = XFS_IOLOCK_EXCL;
 			need_i_mutex = 1;
@@ -720,7 +718,7 @@ retry:
 	current->backing_dev_info = mapping->backing_dev_info;
 
 	if ((ioflags & IO_ISDIRECT)) {
-		if (VN_CACHED(vp)) {
+		if (mapping->nrpages) {
 			WARN_ON(need_i_mutex == 0);
 			xfs_inval_cached_trace(xip, pos, -1,
 					(pos & PAGE_CACHE_MASK), -1);
-- 
cgit v0.10.2


From af048193fcfe2650e7ed3b1ab3d48b1ed0efb467 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:43 +1100
Subject: [XFS] cleanup vnode use in xfs_iops.c

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30552a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 01d9b3f..53f8feb 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -62,12 +62,11 @@ void
 xfs_synchronize_atime(
 	xfs_inode_t	*ip)
 {
-	bhv_vnode_t	*vp;
+	struct inode	*inode = ip->i_vnode;
 
-	vp = XFS_ITOV_NULL(ip);
-	if (vp) {
-		ip->i_d.di_atime.t_sec = (__int32_t)vp->i_atime.tv_sec;
-		ip->i_d.di_atime.t_nsec = (__int32_t)vp->i_atime.tv_nsec;
+	if (inode) {
+		ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+		ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
 	}
 }
 
@@ -80,11 +79,10 @@ void
 xfs_mark_inode_dirty_sync(
 	xfs_inode_t	*ip)
 {
-	bhv_vnode_t	*vp;
+	struct inode	*inode = ip->i_vnode;
 
-	vp = XFS_ITOV_NULL(ip);
-	if (vp)
-		mark_inode_dirty_sync(vn_to_inode(vp));
+	if (inode)
+		mark_inode_dirty_sync(inode);
 }
 
 /*
@@ -215,26 +213,26 @@ xfs_validate_fields(
  */
 STATIC int
 xfs_init_security(
-	bhv_vnode_t	*vp,
+	struct inode	*inode,
 	struct inode	*dir)
 {
-	struct inode	*ip = vn_to_inode(vp);
+	struct xfs_inode *ip = XFS_I(inode);
 	size_t		length;
 	void		*value;
 	char		*name;
 	int		error;
 
-	error = security_inode_init_security(ip, dir, &name, &value, &length);
+	error = security_inode_init_security(inode, dir, &name,
+					     &value, &length);
 	if (error) {
 		if (error == -EOPNOTSUPP)
 			return 0;
 		return -error;
 	}
 
-	error = xfs_attr_set(XFS_I(ip), name, value,
-			length, ATTR_SECURE);
+	error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
 	if (!error)
-		xfs_iflags_set(XFS_I(ip), XFS_IMODIFIED);
+		xfs_iflags_set(ip, XFS_IMODIFIED);
 
 	kfree(name);
 	kfree(value);
@@ -244,7 +242,7 @@ xfs_init_security(
 STATIC void
 xfs_cleanup_inode(
 	struct inode	*dir,
-	bhv_vnode_t	*vp,
+	struct inode	*inode,
 	struct dentry	*dentry,
 	int		mode)
 {
@@ -255,14 +253,14 @@ xfs_cleanup_inode(
 	 * xfs_init_security we must back out.
 	 * ENOSPC can hit here, among other things.
 	 */
-	teardown.d_inode = vn_to_inode(vp);
+	teardown.d_inode = inode;
 	teardown.d_name = dentry->d_name;
 
 	if (S_ISDIR(mode))
 		xfs_rmdir(XFS_I(dir), &teardown);
 	else
 		xfs_remove(XFS_I(dir), &teardown);
-	VN_RELE(vp);
+	iput(inode);
 }
 
 STATIC int
-- 
cgit v0.10.2


From 5f90150abad61b49dbb4a6ca1087fe0a75001ef9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:49 +1100
Subject: [XFS] cleanup vnode use in xfs_bmap.c

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30553a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 19aae13..bce8e3b 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5772,7 +5772,6 @@ xfs_getbmap(
 	int			error;		/* return value */
 	__int64_t		fixlen;		/* length for -1 case */
 	int			i;		/* extent number */
-	bhv_vnode_t		*vp;		/* corresponding vnode */
 	int			lock;		/* lock state */
 	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
 	xfs_mount_t		*mp;		/* file system mount point */
@@ -5789,7 +5788,6 @@ xfs_getbmap(
 	int			bmapi_flags;	/* flags for xfs_bmapi */
 	__int32_t		oflags;		/* getbmapx bmv_oflags field */
 
-	vp = XFS_ITOV(ip);
 	mp = ip->i_mount;
 
 	whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
-- 
cgit v0.10.2


From dfa18b117974d7667a2d5b941853fac3f2e256db Mon Sep 17 00:00:00 2001
From: Niv Sardi <xaiki@sgi.com>
Date: Thu, 6 Mar 2008 13:49:26 +1100
Subject: [XFS] kill t_sema member of struct xfs_trans

It's completely unused so we might as well kill it. Note that there is
another t_sema in struct xlog_ticket, which is used and actually an sv_t
despite the name. That one is left untouched by this patch.

SGI-PV: 971186
SGI-Modid: xfs-linux-melb:xfs-kern:30591a

Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 7f40628..b5effce 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -341,7 +341,6 @@ typedef struct xfs_trans {
 	unsigned int		t_rtx_res;	/* # of rt extents resvd */
 	unsigned int		t_rtx_res_used;	/* # of resvd rt extents used */
 	xfs_log_ticket_t	t_ticket;	/* log mgr ticket */
-	sema_t			t_sema;		/* sema for commit completion */
 	xfs_lsn_t		t_lsn;		/* log seq num of start of
 						 * transaction. */
 	xfs_lsn_t		t_commit_lsn;	/* log seq num of end of
-- 
cgit v0.10.2


From a45c796867df8dabc8eed6e72898d7ba1609bd7e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:49:36 +1100
Subject: [XFS] Remove superfluous xfs_readsb call in xfs_mountfs.

When xfs_mountfs is called by xfs_mount xfs_readsb was called 35 lines
above unconditionally, so there is no need to try to read the superblock
if it's not present. If any other port doesn't have the superblock read at
this point it should just call it directly from its xfs_mount equivalent.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30603a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Donald Douwsma <donaldd@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 8ed164e..41b690e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -964,11 +964,6 @@ xfs_mountfs(
 	int		uuid_mounted = 0;
 	int		error = 0;
 
-	if (mp->m_sb_bp == NULL) {
-		error = xfs_readsb(mp, mfsi_flags);
-		if (error)
-			return error;
-	}
 	xfs_mount_common(mp, sbp);
 
 	/*
-- 
cgit v0.10.2


From 535f6b3735db6ef6026537bfe55ae00c3d9cc1ee Mon Sep 17 00:00:00 2001
From: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Date: Thu, 27 Mar 2008 17:58:27 +1100
Subject: [XFS] Replace custom AIL linked-list code with struct list_head

Replace the xfs_ail_entry_t with a struct list_head and clean the
surrounding code up. Also fixes a livelock in xfs_trans_first_push_ail()
by terminating the loop at the head of the list correctly.

SGI-PV: 978682
SGI-Modid: xfs-linux-melb:xfs-kern:30636a

Signed-off-by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7b37fa0..77b39f6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -220,7 +220,7 @@ extern void	xfs_icsb_sync_counters_flags(struct xfs_mount *, int);
 #endif
 
 typedef struct xfs_ail {
-	xfs_ail_entry_t		xa_ail;
+	struct list_head	xa_ail;
 	uint			xa_gen;
 	struct task_struct	*xa_task;
 	xfs_lsn_t		xa_target;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index b5effce..0804207 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -113,13 +113,8 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot_acct;
 
-typedef struct xfs_ail_entry {
-	struct xfs_log_item	*ail_forw;	/* AIL forw pointer */
-	struct xfs_log_item	*ail_back;	/* AIL back pointer */
-} xfs_ail_entry_t;
-
 typedef struct xfs_log_item {
-	xfs_ail_entry_t			li_ail;		/* AIL pointers */
+	struct list_head		li_ail;		/* AIL pointers */
 	xfs_lsn_t			li_lsn;		/* last on-disk lsn */
 	struct xfs_log_item_desc	*li_desc;	/* ptr to current desc*/
 	struct xfs_mount		*li_mountp;	/* ptr to fs mount */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 76d470d..13235ae 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,13 +28,13 @@
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
 
-STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *);
-STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *);
+STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *);
 
 #ifdef DEBUG
-STATIC void xfs_ail_check(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
 #else
 #define	xfs_ail_check(a,l)
 #endif /* DEBUG */
@@ -57,7 +57,7 @@ xfs_trans_tail_ail(
 	xfs_log_item_t	*lip;
 
 	spin_lock(&mp->m_ail_lock);
-	lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+	lip = xfs_ail_min(&mp->m_ail);
 	if (lip == NULL) {
 		lsn = (xfs_lsn_t)0;
 	} else {
@@ -91,7 +91,7 @@ xfs_trans_push_ail(
 {
 	xfs_log_item_t		*lip;
 
-	lip = xfs_ail_min(&mp->m_ail.xa_ail);
+	lip = xfs_ail_min(&mp->m_ail);
 	if (lip && !XFS_FORCED_SHUTDOWN(mp)) {
 		if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0)
 			xfsaild_wakeup(mp, threshold_lsn);
@@ -111,15 +111,17 @@ xfs_trans_first_push_ail(
 {
 	xfs_log_item_t	*lip;
 
-	lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+	lip = xfs_ail_min(&mp->m_ail);
 	*gen = (int)mp->m_ail.xa_gen;
 	if (lsn == 0)
 		return lip;
 
-	while (lip && (XFS_LSN_CMP(lip->li_lsn, lsn) < 0))
-		lip = lip->li_ail.ail_forw;
+	list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) {
+		if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
+			return lip;
+	}
 
-	return lip;
+	return NULL;
 }
 
 /*
@@ -329,7 +331,7 @@ xfs_trans_unlocked_item(
 	 * the call to xfs_log_move_tail() doesn't do anything if there's
 	 * not enough free space to wake people up so we're safe calling it.
 	 */
-	min_lip = xfs_ail_min(&mp->m_ail.xa_ail);
+	min_lip = xfs_ail_min(&mp->m_ail);
 
 	if (min_lip == lip)
 		xfs_log_move_tail(mp, 1);
@@ -357,15 +359,13 @@ xfs_trans_update_ail(
 	xfs_log_item_t	*lip,
 	xfs_lsn_t	lsn) __releases(mp->m_ail_lock)
 {
-	xfs_ail_entry_t		*ailp;
 	xfs_log_item_t		*dlip=NULL;
 	xfs_log_item_t		*mlip;	/* ptr to minimum lip */
 
-	ailp = &(mp->m_ail.xa_ail);
-	mlip = xfs_ail_min(ailp);
+	mlip = xfs_ail_min(&mp->m_ail);
 
 	if (lip->li_flags & XFS_LI_IN_AIL) {
-		dlip = xfs_ail_delete(ailp, lip);
+		dlip = xfs_ail_delete(&mp->m_ail, lip);
 		ASSERT(dlip == lip);
 	} else {
 		lip->li_flags |= XFS_LI_IN_AIL;
@@ -373,11 +373,11 @@ xfs_trans_update_ail(
 
 	lip->li_lsn = lsn;
 
-	xfs_ail_insert(ailp, lip);
+	xfs_ail_insert(&mp->m_ail, lip);
 	mp->m_ail.xa_gen++;
 
 	if (mlip == dlip) {
-		mlip = xfs_ail_min(&(mp->m_ail.xa_ail));
+		mlip = xfs_ail_min(&mp->m_ail);
 		spin_unlock(&mp->m_ail_lock);
 		xfs_log_move_tail(mp, mlip->li_lsn);
 	} else {
@@ -407,14 +407,12 @@ xfs_trans_delete_ail(
 	xfs_mount_t	*mp,
 	xfs_log_item_t	*lip) __releases(mp->m_ail_lock)
 {
-	xfs_ail_entry_t		*ailp;
 	xfs_log_item_t		*dlip;
 	xfs_log_item_t		*mlip;
 
 	if (lip->li_flags & XFS_LI_IN_AIL) {
-		ailp = &(mp->m_ail.xa_ail);
-		mlip = xfs_ail_min(ailp);
-		dlip = xfs_ail_delete(ailp, lip);
+		mlip = xfs_ail_min(&mp->m_ail);
+		dlip = xfs_ail_delete(&mp->m_ail, lip);
 		ASSERT(dlip == lip);
 
 
@@ -423,7 +421,7 @@ xfs_trans_delete_ail(
 		mp->m_ail.xa_gen++;
 
 		if (mlip == dlip) {
-			mlip = xfs_ail_min(&(mp->m_ail.xa_ail));
+			mlip = xfs_ail_min(&mp->m_ail);
 			spin_unlock(&mp->m_ail_lock);
 			xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
 		} else {
@@ -461,7 +459,7 @@ xfs_trans_first_ail(
 {
 	xfs_log_item_t	*lip;
 
-	lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+	lip = xfs_ail_min(&mp->m_ail);
 	*gen = (int)mp->m_ail.xa_gen;
 
 	return lip;
@@ -485,9 +483,9 @@ xfs_trans_next_ail(
 
 	ASSERT(mp && lip && gen);
 	if (mp->m_ail.xa_gen == *gen) {
-		nlip = xfs_ail_next(&(mp->m_ail.xa_ail), lip);
+		nlip = xfs_ail_next(&mp->m_ail, lip);
 	} else {
-		nlip = xfs_ail_min(&(mp->m_ail).xa_ail);
+		nlip = xfs_ail_min(&mp->m_ail);
 		*gen = (int)mp->m_ail.xa_gen;
 		if (restarts != NULL) {
 			XFS_STATS_INC(xs_push_ail_restarts);
@@ -517,8 +515,7 @@ int
 xfs_trans_ail_init(
 	xfs_mount_t	*mp)
 {
-	mp->m_ail.xa_ail.ail_forw = (xfs_log_item_t*)&mp->m_ail.xa_ail;
-	mp->m_ail.xa_ail.ail_back = (xfs_log_item_t*)&mp->m_ail.xa_ail;
+	INIT_LIST_HEAD(&mp->m_ail.xa_ail);
 	return xfsaild_start(mp);
 }
 
@@ -537,7 +534,7 @@ xfs_trans_ail_destroy(
  */
 STATIC void
 xfs_ail_insert(
-	xfs_ail_entry_t	*base,
+	xfs_ail_t	*ailp,
 	xfs_log_item_t	*lip)
 /* ARGSUSED */
 {
@@ -546,27 +543,22 @@ xfs_ail_insert(
 	/*
 	 * If the list is empty, just insert the item.
 	 */
-	if (base->ail_back == (xfs_log_item_t*)base) {
-		base->ail_forw = lip;
-		base->ail_back = lip;
-		lip->li_ail.ail_forw = (xfs_log_item_t*)base;
-		lip->li_ail.ail_back = (xfs_log_item_t*)base;
+	if (list_empty(&ailp->xa_ail)) {
+		list_add(&lip->li_ail, &ailp->xa_ail);
 		return;
 	}
 
-	next_lip = base->ail_back;
-	while ((next_lip != (xfs_log_item_t*)base) &&
-	       (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) {
-		next_lip = next_lip->li_ail.ail_back;
+	list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
+		if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
+			break;
 	}
-	ASSERT((next_lip == (xfs_log_item_t*)base) ||
+
+	ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
 	       (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
-	lip->li_ail.ail_forw = next_lip->li_ail.ail_forw;
-	lip->li_ail.ail_back = next_lip;
-	next_lip->li_ail.ail_forw = lip;
-	lip->li_ail.ail_forw->li_ail.ail_back = lip;
 
-	xfs_ail_check(base, lip);
+	list_add(&lip->li_ail, &next_lip->li_ail);
+
+	xfs_ail_check(ailp, lip);
 	return;
 }
 
@@ -576,15 +568,13 @@ xfs_ail_insert(
 /*ARGSUSED*/
 STATIC xfs_log_item_t *
 xfs_ail_delete(
-	xfs_ail_entry_t	*base,
+	xfs_ail_t	*ailp,
 	xfs_log_item_t	*lip)
 /* ARGSUSED */
 {
-	xfs_ail_check(base, lip);
-	lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back;
-	lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw;
-	lip->li_ail.ail_forw = NULL;
-	lip->li_ail.ail_back = NULL;
+	xfs_ail_check(ailp, lip);
+
+	list_del(&lip->li_ail);
 
 	return lip;
 }
@@ -595,14 +585,13 @@ xfs_ail_delete(
  */
 STATIC xfs_log_item_t *
 xfs_ail_min(
-	xfs_ail_entry_t	*base)
+	xfs_ail_t	*ailp)
 /* ARGSUSED */
 {
-	register xfs_log_item_t *forw = base->ail_forw;
-	if (forw == (xfs_log_item_t*)base) {
+	if (list_empty(&ailp->xa_ail))
 		return NULL;
-	}
-	return forw;
+
+	return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
 }
 
 /*
@@ -612,15 +601,14 @@ xfs_ail_min(
  */
 STATIC xfs_log_item_t *
 xfs_ail_next(
-	xfs_ail_entry_t	*base,
+	xfs_ail_t	*ailp,
 	xfs_log_item_t	*lip)
 /* ARGSUSED */
 {
-	if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) {
+	if (lip->li_ail.next == &ailp->xa_ail)
 		return NULL;
-	}
-	return lip->li_ail.ail_forw;
 
+	return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
 }
 
 #ifdef DEBUG
@@ -629,57 +617,40 @@ xfs_ail_next(
  */
 STATIC void
 xfs_ail_check(
-	xfs_ail_entry_t *base,
+	xfs_ail_t 	*ailp,
 	xfs_log_item_t	*lip)
 {
 	xfs_log_item_t	*prev_lip;
 
-	prev_lip = base->ail_forw;
-	if (prev_lip == (xfs_log_item_t*)base) {
-		/*
-		 * Make sure the pointers are correct when the list
-		 * is empty.
-		 */
-		ASSERT(base->ail_back == (xfs_log_item_t*)base);
+	if (list_empty(&ailp->xa_ail))
 		return;
-	}
 
 	/*
 	 * Check the next and previous entries are valid.
 	 */
 	ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
-	prev_lip = lip->li_ail.ail_back;
-	if (prev_lip != (xfs_log_item_t*)base) {
-		ASSERT(prev_lip->li_ail.ail_forw == lip);
+	prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
+	if (&prev_lip->li_ail != &ailp->xa_ail)
 		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
-	}
-	prev_lip = lip->li_ail.ail_forw;
-	if (prev_lip != (xfs_log_item_t*)base) {
-		ASSERT(prev_lip->li_ail.ail_back == lip);
+
+	prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
+	if (&prev_lip->li_ail != &ailp->xa_ail)
 		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
-	}
 
 
 #ifdef XFS_TRANS_DEBUG
 	/*
-	 * Walk the list checking forward and backward pointers,
-	 * lsn ordering, and that every entry has the XFS_LI_IN_AIL
-	 * flag set. This is really expensive, so only do it when
-	 * specifically debugging the transaction subsystem.
+	 * Walk the list checking lsn ordering, and that every entry has the
+	 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
+	 * when specifically debugging the transaction subsystem.
 	 */
-	prev_lip = (xfs_log_item_t*)base;
-	while (lip != (xfs_log_item_t*)base) {
-		if (prev_lip != (xfs_log_item_t*)base) {
-			ASSERT(prev_lip->li_ail.ail_forw == lip);
+	prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
+	list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
+		if (&prev_lip->li_ail != &ailp->xa_ail)
 			ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
-		}
-		ASSERT(lip->li_ail.ail_back == prev_lip);
 		ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
 		prev_lip = lip;
-		lip = lip->li_ail.ail_forw;
 	}
-	ASSERT(lip == (xfs_log_item_t*)base);
-	ASSERT(base->ail_back == prev_lip);
 #endif /* XFS_TRANS_DEBUG */
 }
 #endif /* DEBUG */
-- 
cgit v0.10.2


From 75de2a91c98a6f486f261c1367fe59f5583e15a3 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 27 Mar 2008 18:00:38 +1100
Subject: [XFS] Account for inode cluster alignment in all allocations

At ENOSPC, we can get a filesystem shutdown due to cancelling a dirty
transaction in xfs_mkdir or xfs_create. This is due to the initial
allocation attempt not taking into account inode alignment and hence we
can prepare the AGF freelist for allocation when it's not actually
possible to do an allocation. This results in inode allocation returning
ENOSPC with a dirty transaction, and hence we shut down the filesystem.

Because the first allocation is an exact allocation attempt, we must tell
the allocator that the alignment does not affect the allocation attempt.
i.e. we will accept any extent alignment as long as the extent starts at
the block we want. Unfortunately, this means that if the longest free
extent is less than the length + alignment necessary for fallback
allocation attempts but is long enough to attempt a non-aligned
allocation, we will modify the free list.

If we then have the exact allocation fail, all other allocation attempts
will also fail due to the alignment constraint being taken into account.
Hence the initial attempt needs to set the "alignment slop" field so that
alignment, while not required, must be taken into account when determining
if there is enough space left in the AG to do the allocation.

That means if the exact allocation fails, we will not dirty the freelist
if there is not enough space available for a subsequent allocation to
succeed. Hence we get an ENOSPC error back to userspace without shutting
down the filesystem.

SGI-PV: 978886
SGI-Modid: xfs-linux-melb:xfs-kern:30699a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5a146cb..a64dfbd 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -107,6 +107,16 @@ xfs_ialloc_log_di(
 /*
  * Allocation group level functions.
  */
+static inline int
+xfs_ialloc_cluster_alignment(
+	xfs_alloc_arg_t	*args)
+{
+	if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
+	    args->mp->m_sb.sb_inoalignmt >=
+	     XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp)))
+		return args->mp->m_sb.sb_inoalignmt;
+	return 1;
+}
 
 /*
  * Allocate new inodes in the allocation group specified by agbp.
@@ -167,10 +177,24 @@ xfs_ialloc_ag_alloc(
 		args.mod = args.total = args.wasdel = args.isfl =
 			args.userdata = args.minalignslop = 0;
 		args.prod = 1;
-		args.alignment = 1;
+
 		/*
-		 * Allow space for the inode btree to split.
+		 * We need to take into account alignment here to ensure that
+		 * we don't modify the free list if we fail to have an exact
+		 * block. If we don't have an exact match, and every other
+		 * allocation attempt fails, we'll end up cancelling
+		 * a dirty transaction and shutting down.
+		 *
+		 * For an exact allocation, alignment must be 1,
+		 * however we need to take cluster alignment into account when
+		 * fixing up the freelist. Use the minalignslop field to
+		 * indicate that extra blocks might be required for alignment,
+		 * but not to use them in the actual exact allocation.
 		 */
+		args.alignment = 1;
+		args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
+
+		/* Allow space for the inode btree to split. */
 		args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
@@ -191,13 +215,8 @@ xfs_ialloc_ag_alloc(
 			ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
 			args.alignment = args.mp->m_dalign;
 			isaligned = 1;
-		} else if (xfs_sb_version_hasalign(&args.mp->m_sb) &&
-			   args.mp->m_sb.sb_inoalignmt >=
-			   XFS_B_TO_FSBT(args.mp,
-			  	XFS_INODE_CLUSTER_SIZE(args.mp)))
-				args.alignment = args.mp->m_sb.sb_inoalignmt;
-		else
-			args.alignment = 1;
+		} else
+			args.alignment = xfs_ialloc_cluster_alignment(&args);
 		/*
 		 * Need to figure out where to allocate the inode blocks.
 		 * Ideally they should be spaced out through the a.g.
@@ -230,12 +249,7 @@ xfs_ialloc_ag_alloc(
 		args.agbno = be32_to_cpu(agi->agi_root);
 		args.fsbno = XFS_AGB_TO_FSB(args.mp,
 				be32_to_cpu(agi->agi_seqno), args.agbno);
-		if (xfs_sb_version_hasalign(&args.mp->m_sb) &&
-			args.mp->m_sb.sb_inoalignmt >=
-			XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
-				args.alignment = args.mp->m_sb.sb_inoalignmt;
-		else
-			args.alignment = 1;
+		args.alignment = xfs_ialloc_cluster_alignment(&args);
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 	}
-- 
cgit v0.10.2


From 59a33f9f776b051018ec98af95bd9fe8ba9d0f3e Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 27 Mar 2008 18:00:45 +1100
Subject: [XFS] Ensure a btree insert returns a valid cursor.

When writing into preallocated regions there is a case where XFS can oops
or hang doing the unwritten extent conversion on I/O completion. It turns
out that the problem is related to the btree cursor being invalid.

When we do an insert into the tree, we may need to split blocks in the
tree. When we only split at the leaf level (i.e. level 0), everything
works just fine. However, if we have a multi-level split in the btree,
the cursor passed to the insert function is no longer valid once the
insert is complete.

The leaf level split is handled correctly because all the operations at
level 0 are done using the original cursor, hence it is updated correctly.
However, when we need to update the next level up the tree, we don't use
that cursor - we use a cloned cursor that points to the index in the next
level up where we need to do the insert.

Hence if we need to split a second level, the changes to the tree are
reflected in the cloned cursor and not the original cursor. This
clone-and-move-up-a-level-on-split behaviour recurses all the way to the
top of the tree.

The complexity here is that these cloned cursors do not point to the
original index that was inserted - they point to the newly allocated block
(the right block) and the original cursor pointer to that level may still
point to the left block. Hence, without deep examination of the cloned
cursor and buffers, we cannot update the original cursor with the new path
from the cloned cursor.

In these cases the original cursor could be pointing to the wrong block(s)
and hence a subsequent modification to the tree using that cursor will
lead to corruption of the tree.

The crash case occurs when the tree changes height - we insert a new level
in the tree, and the cursor does not have a buffer in its path for that
level. Hence any attempt to walk back up the cursor to the root block will
result in a null pointer dereference.

To make matters even more complex, the BMAP BT is rooted in an inode, so
we can have a change of height in the btree *without a root split*. That
is, if the root block in the inode is full when we split a leaf node, we
cannot fit the pointer to the new block in the root, so we allocate a new
block, migrate all the ptrs out of the inode into the new block and point
the inode root block at the newly allocated block. This changes the height
of the tree without a root split having occurred and hence invalidates the
path in the original cursor.

The patch below prevents xfs_bmbt_insert() from returning with an invalid
cursor by detecting the cases that invalidate the original cursor and
refreshing it by doing a lookup into the btree for the original index we were
inserting at.

Note that the INOBT, AGFBNO and AGFCNT btree implementations also have
this bug, but the cursor is currently always destroyed or revalidated
after an insert for those trees. Hence this patch only addresses the problem
in the BMBT code.

SGI-PV: 979339
SGI-Modid: xfs-linux-melb:xfs-kern:30701a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bd18987..93470b7 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -2027,6 +2027,24 @@ xfs_bmbt_increment(
 
 /*
  * Insert the current record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor. It appears, however, that some callers assume that the cursor is
+ * always valid. Hence if we do a multi-level split we need to revalidate the
+ * cursor.
+ *
+ * When a split occurs, we will see a new cursor returned. Use that as a
+ * trigger to determine if we need to revalidate the original cursor. If we get
+ * a split, then use the original irec to look up the path of the record we
+ * just inserted.
+ *
+ * Note that the fact that the btree root is in the inode means that we can
+ * have the level of the tree change without a "split" occurring at the root
+ * level. What happens is that the root is migrated to an allocated block and
+ * the inode root is pointed to it. This means a single split can change the
+ * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence
+ * the level change should be accounted as a split so as to correctly trigger a
+ * revalidation of the old cursor.
  */
 int					/* error */
 xfs_bmbt_insert(
@@ -2039,11 +2057,14 @@ xfs_bmbt_insert(
 	xfs_fsblock_t	nbno;
 	xfs_btree_cur_t	*ncur;
 	xfs_bmbt_rec_t	nrec;
+	xfs_bmbt_irec_t	oirec;		/* original irec */
 	xfs_btree_cur_t	*pcur;
+	int		splits = 0;
 
 	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
 	level = 0;
 	nbno = NULLFSBLOCK;
+	oirec = cur->bc_rec.b;
 	xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
 	ncur = NULL;
 	pcur = cur;
@@ -2052,11 +2073,13 @@ xfs_bmbt_insert(
 				&i))) {
 			if (pcur != cur)
 				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-			return error;
+			goto error0;
 		}
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
+			/* allocating a new root is effectively a split */
+			if (cur->bc_nlevels != pcur->bc_nlevels)
+				splits++;
 			cur->bc_nlevels = pcur->bc_nlevels;
 			cur->bc_private.b.allocated +=
 				pcur->bc_private.b.allocated;
@@ -2070,10 +2093,21 @@ xfs_bmbt_insert(
 			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
 		}
 		if (ncur) {
+			splits++;
 			pcur = ncur;
 			ncur = NULL;
 		}
 	} while (nbno != NULLFSBLOCK);
+
+	if (splits > 1) {
+		/* revalidate the old cursor as we had a multi-level split */
+		error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff,
+				oirec.br_startblock, oirec.br_blockcount, &i);
+		if (error)
+			goto error0;
+		ASSERT(i == 1);
+	}
+
 	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
 	*stat = i;
 	return 0;
-- 
cgit v0.10.2


From f3dcc13f6fa20af1171eac7a537a4b89b1a84849 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 27 Mar 2008 18:00:54 +1100
Subject: [XFS] cleanup root inode handling in xfs_fs_fill_super

- rename rootvp to root for clarity
- remove useless vn_to_inode call
- check is_bad_inode before calling d_alloc_root
- use iput instead of VN_RELE in the error case

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30708a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index cb9ce90..72e55db 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1307,7 +1307,7 @@ xfs_fs_fill_super(
 	void			*data,
 	int			silent)
 {
-	struct inode		*rootvp;
+	struct inode		*root;
 	struct xfs_mount	*mp = NULL;
 	struct xfs_mount_args	*args = xfs_args_allocate(sb, silent);
 	int			error;
@@ -1345,19 +1345,18 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	rootvp = igrab(mp->m_rootip->i_vnode);
-	if (!rootvp) {
+	root = igrab(mp->m_rootip->i_vnode);
+	if (!root) {
 		error = ENOENT;
 		goto fail_unmount;
 	}
-
-	sb->s_root = d_alloc_root(vn_to_inode(rootvp));
-	if (!sb->s_root) {
-		error = ENOMEM;
+	if (is_bad_inode(root)) {
+		error = EINVAL;
 		goto fail_vnrele;
 	}
-	if (is_bad_inode(sb->s_root->d_inode)) {
-		error = EINVAL;
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		error = ENOMEM;
 		goto fail_vnrele;
 	}
 
@@ -1379,7 +1378,7 @@ fail_vnrele:
 		dput(sb->s_root);
 		sb->s_root = NULL;
 	} else {
-		VN_RELE(rootvp);
+		iput(root);
 	}
 
 fail_unmount:
-- 
cgit v0.10.2


From df26cfe849d8fd767b26fcd4bfebfff67bda9f3a Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>
Date: Fri, 18 Apr 2008 11:44:03 +1000
Subject: [XFS] split xfs_ioc_xattr

The three subcases of xfs_ioc_xattr don't share any semantics and almost
no code, so split it into three separate helpers.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30709a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index f34bd01..c6399b2 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1179,85 +1179,85 @@ xfs_ioc_fsgetxattr(
 }
 
 STATIC int
-xfs_ioc_xattr(
+xfs_ioc_fssetxattr(
 	xfs_inode_t		*ip,
 	struct file		*filp,
-	unsigned int		cmd,
 	void			__user *arg)
 {
 	struct fsxattr		fa;
 	struct bhv_vattr	*vattr;
-	int			error = 0;
+	int			error;
 	int			attr_flags;
-	unsigned int		flags;
+
+	if (copy_from_user(&fa, arg, sizeof(fa)))
+		return -EFAULT;
 
 	vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
 	if (unlikely(!vattr))
 		return -ENOMEM;
 
-	switch (cmd) {
-	case XFS_IOC_FSSETXATTR: {
-		if (copy_from_user(&fa, arg, sizeof(fa))) {
-			error = -EFAULT;
-			break;
-		}
+	attr_flags = 0;
+	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+		attr_flags |= ATTR_NONBLOCK;
 
-		attr_flags = 0;
-		if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-			attr_flags |= ATTR_NONBLOCK;
+	vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
+	vattr->va_xflags  = fa.fsx_xflags;
+	vattr->va_extsize = fa.fsx_extsize;
+	vattr->va_projid  = fa.fsx_projid;
 
-		vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
-		vattr->va_xflags  = fa.fsx_xflags;
-		vattr->va_extsize = fa.fsx_extsize;
-		vattr->va_projid  = fa.fsx_projid;
+	error = -xfs_setattr(ip, vattr, attr_flags, NULL);
+	if (!error)
+		vn_revalidate(XFS_ITOV(ip));	/* update flags */
+	kfree(vattr);
+	return error;
+}
 
-		error = xfs_setattr(ip, vattr, attr_flags, NULL);
-		if (likely(!error))
-			vn_revalidate(XFS_ITOV(ip));	/* update flags */
-		error = -error;
-		break;
-	}
+STATIC int
+xfs_ioc_getxflags(
+	xfs_inode_t		*ip,
+	void			__user *arg)
+{
+	unsigned int		flags;
 
-	case XFS_IOC_GETXFLAGS: {
-		flags = xfs_di2lxflags(ip->i_d.di_flags);
-		if (copy_to_user(arg, &flags, sizeof(flags)))
-			error = -EFAULT;
-		break;
-	}
+	flags = xfs_di2lxflags(ip->i_d.di_flags);
+	if (copy_to_user(arg, &flags, sizeof(flags)))
+		return -EFAULT;
+	return 0;
+}
 
-	case XFS_IOC_SETXFLAGS: {
-		if (copy_from_user(&flags, arg, sizeof(flags))) {
-			error = -EFAULT;
-			break;
-		}
+STATIC int
+xfs_ioc_setxflags(
+	xfs_inode_t		*ip,
+	struct file		*filp,
+	void			__user *arg)
+{
+	struct bhv_vattr	*vattr;
+	unsigned int		flags;
+	int			attr_flags;
+	int			error;
 
-		if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
-			      FS_NOATIME_FL | FS_NODUMP_FL | \
-			      FS_SYNC_FL)) {
-			error = -EOPNOTSUPP;
-			break;
-		}
+	if (copy_from_user(&flags, arg, sizeof(flags)))
+		return -EFAULT;
 
-		attr_flags = 0;
-		if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-			attr_flags |= ATTR_NONBLOCK;
+	if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+		      FS_NOATIME_FL | FS_NODUMP_FL | \
+		      FS_SYNC_FL))
+		return -EOPNOTSUPP;
 
-		vattr->va_mask = XFS_AT_XFLAGS;
-		vattr->va_xflags = xfs_merge_ioc_xflags(flags,
-							xfs_ip2xflags(ip));
+	vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
+	if (unlikely(!vattr))
+		return -ENOMEM;
 
-		error = xfs_setattr(ip, vattr, attr_flags, NULL);
-		if (likely(!error))
-			vn_revalidate(XFS_ITOV(ip));	/* update flags */
-		error = -error;
-		break;
-	}
+	attr_flags = 0;
+	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+		attr_flags |= ATTR_NONBLOCK;
 
-	default:
-		error = -ENOTTY;
-		break;
-	}
+	vattr->va_mask = XFS_AT_XFLAGS;
+	vattr->va_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
 
+	error = -xfs_setattr(ip, vattr, attr_flags, NULL);
+	if (likely(!error))
+		vn_revalidate(XFS_ITOV(ip));	/* update flags */
 	kfree(vattr);
 	return error;
 }
@@ -1332,3 +1332,259 @@ xfs_ioc_getbmapx(
 
 	return 0;
 }
+
+int
+xfs_ioctl(
+	xfs_inode_t		*ip,
+	struct file		*filp,
+	int			ioflags,
+	unsigned int		cmd,
+	void			__user *arg)
+{
+	struct inode		*inode = filp->f_path.dentry->d_inode;
+	xfs_mount_t		*mp = ip->i_mount;
+	int			error;
+
+	xfs_itrace_entry(XFS_I(inode));
+	switch (cmd) {
+
+	case XFS_IOC_ALLOCSP:
+	case XFS_IOC_FREESP:
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_ALLOCSP64:
+	case XFS_IOC_FREESP64:
+	case XFS_IOC_RESVSP64:
+	case XFS_IOC_UNRESVSP64:
+		/*
+		 * Only allow the sys admin to reserve space unless
+		 * unwritten extents are enabled.
+		 */
+		if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
+		    !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
+
+	case XFS_IOC_DIOINFO: {
+		struct dioattr	da;
+		xfs_buftarg_t	*target =
+			XFS_IS_REALTIME_INODE(ip) ?
+			mp->m_rtdev_targp : mp->m_ddev_targp;
+
+		da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
+		da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
+
+		if (copy_to_user(arg, &da, sizeof(da)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_FSBULKSTAT_SINGLE:
+	case XFS_IOC_FSBULKSTAT:
+	case XFS_IOC_FSINUMBERS:
+		return xfs_ioc_bulkstat(mp, cmd, arg);
+
+	case XFS_IOC_FSGEOMETRY_V1:
+		return xfs_ioc_fsgeometry_v1(mp, arg);
+
+	case XFS_IOC_FSGEOMETRY:
+		return xfs_ioc_fsgeometry(mp, arg);
+
+	case XFS_IOC_GETVERSION:
+		return put_user(inode->i_generation, (int __user *)arg);
+
+	case XFS_IOC_FSGETXATTR:
+		return xfs_ioc_fsgetxattr(ip, 0, arg);
+	case XFS_IOC_FSGETXATTRA:
+		return xfs_ioc_fsgetxattr(ip, 1, arg);
+	case XFS_IOC_FSSETXATTR:
+		return xfs_ioc_fssetxattr(ip, filp, arg);
+	case XFS_IOC_GETXFLAGS:
+		return xfs_ioc_getxflags(ip, arg);
+	case XFS_IOC_SETXFLAGS:
+		return xfs_ioc_setxflags(ip, filp, arg);
+
+	case XFS_IOC_FSSETDM: {
+		struct fsdmidata	dmi;
+
+		if (copy_from_user(&dmi, arg, sizeof(dmi)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
+				dmi.fsd_dmstate);
+		return -error;
+	}
+
+	case XFS_IOC_GETBMAP:
+	case XFS_IOC_GETBMAPA:
+		return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+
+	case XFS_IOC_GETBMAPX:
+		return xfs_ioc_getbmapx(ip, arg);
+
+	case XFS_IOC_FD_TO_HANDLE:
+	case XFS_IOC_PATH_TO_HANDLE:
+	case XFS_IOC_PATH_TO_FSHANDLE:
+		return xfs_find_handle(cmd, arg);
+
+	case XFS_IOC_OPEN_BY_HANDLE:
+		return xfs_open_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_FSSETDM_BY_HANDLE:
+		return xfs_fssetdm_by_handle(mp, arg, inode);
+
+	case XFS_IOC_READLINK_BY_HANDLE:
+		return xfs_readlink_by_handle(mp, arg, inode);
+
+	case XFS_IOC_ATTRLIST_BY_HANDLE:
+		return xfs_attrlist_by_handle(mp, arg, inode);
+
+	case XFS_IOC_ATTRMULTI_BY_HANDLE:
+		return xfs_attrmulti_by_handle(mp, arg, inode);
+
+	case XFS_IOC_SWAPEXT: {
+		error = xfs_swapext((struct xfs_swapext __user *)arg);
+		return -error;
+	}
+
+	case XFS_IOC_FSCOUNTS: {
+		xfs_fsop_counts_t out;
+
+		error = xfs_fs_counts(mp, &out);
+		if (error)
+			return -error;
+
+		if (copy_to_user(arg, &out, sizeof(out)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_SET_RESBLKS: {
+		xfs_fsop_resblks_t inout;
+		__uint64_t	   in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&inout, arg, sizeof(inout)))
+			return -XFS_ERROR(EFAULT);
+
+		/* input parameter is passed in resblks field of structure */
+		in = inout.resblks;
+		error = xfs_reserve_blocks(mp, &in, &inout);
+		if (error)
+			return -error;
+
+		if (copy_to_user(arg, &inout, sizeof(inout)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_GET_RESBLKS: {
+		xfs_fsop_resblks_t out;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		error = xfs_reserve_blocks(mp, NULL, &out);
+		if (error)
+			return -error;
+
+		if (copy_to_user(arg, &out, sizeof(out)))
+			return -XFS_ERROR(EFAULT);
+
+		return 0;
+	}
+
+	case XFS_IOC_FSGROWFSDATA: {
+		xfs_growfs_data_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_data(mp, &in);
+		return -error;
+	}
+
+	case XFS_IOC_FSGROWFSLOG: {
+		xfs_growfs_log_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_log(mp, &in);
+		return -error;
+	}
+
+	case XFS_IOC_FSGROWFSRT: {
+		xfs_growfs_rt_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_rt(mp, &in);
+		return -error;
+	}
+
+	case XFS_IOC_FREEZE:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (inode->i_sb->s_frozen == SB_UNFROZEN)
+			freeze_bdev(inode->i_sb->s_bdev);
+		return 0;
+
+	case XFS_IOC_THAW:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		if (inode->i_sb->s_frozen != SB_UNFROZEN)
+			thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
+		return 0;
+
+	case XFS_IOC_GOINGDOWN: {
+		__uint32_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (get_user(in, (__uint32_t __user *)arg))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_fs_goingdown(mp, in);
+		return -error;
+	}
+
+	case XFS_IOC_ERROR_INJECTION: {
+		xfs_error_injection_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_errortag_add(in.errtag, mp);
+		return -error;
+	}
+
+	case XFS_IOC_ERROR_CLEARALL:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		error = xfs_errortag_clearall(mp, 1);
+		return -error;
+
+	default:
+		return -ENOTTY;
+	}
+}
-- 
cgit v0.10.2


From 433550990e6c2e94995239bac6a52b4df454cae0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 27 Mar 2008 18:01:08 +1100
Subject: [XFS] remove most calls to VN_RELE

Most VN_RELE calls either directly contain an XFS_ITOV or have the
corresponding xfs_inode already in scope. Use the IRELE helper instead of
VN_RELE to clarify the code. With a little more work we can kill VN_RELE
altogether and define IRELE in terms of iput directly.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30710a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 8e9c5ae..adbc7bb 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1810,7 +1810,7 @@ xfs_qm_dqusage_adjust(
 	 * Now release the inode. This will send it to 'inactive', and
 	 * possibly even free blocks.
 	 */
-	VN_RELE(XFS_ITOV(ip));
+	IRELE(ip);
 
 	/*
 	 * Goto next inode.
@@ -1968,7 +1968,7 @@ xfs_qm_init_quotainos(
 			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
 					     0, 0, &gip, 0))) {
 				if (uip)
-					VN_RELE(XFS_ITOV(uip));
+					IRELE(uip);
 				return XFS_ERROR(error);
 			}
 		}
@@ -1999,7 +1999,7 @@ xfs_qm_init_quotainos(
 					  sbflags | XFS_SB_GQUOTINO, flags);
 		if (error) {
 			if (uip)
-				VN_RELE(XFS_ITOV(uip));
+				IRELE(uip);
 
 			return XFS_ERROR(error);
 		}
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index d2b8be7..3dc161f 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -386,7 +386,7 @@ xfs_qm_scall_trunc_qfiles(
 		error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
 		if (! error) {
 			(void) xfs_truncate_file(mp, qip);
-			VN_RELE(XFS_ITOV(qip));
+			IRELE(qip);
 		}
 	}
 
@@ -395,7 +395,7 @@ xfs_qm_scall_trunc_qfiles(
 		error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
 		if (! error) {
 			(void) xfs_truncate_file(mp, qip);
-			VN_RELE(XFS_ITOV(qip));
+			IRELE(qip);
 		}
 	}
 
@@ -552,13 +552,13 @@ xfs_qm_scall_getqstat(
 		out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
 		out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
 		if (tempuqip)
-			VN_RELE(XFS_ITOV(uip));
+			IRELE(uip);
 	}
 	if (gip) {
 		out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
 		out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
 		if (tempgqip)
-			VN_RELE(XFS_ITOV(gip));
+			IRELE(gip);
 	}
 	if (mp->m_quotainfo) {
 		out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
@@ -1095,7 +1095,7 @@ again:
 		 * inactive code in hell.
 		 */
 		if (vnode_refd)
-			VN_RELE(vp);
+			IRELE(ip);
 		XFS_MOUNT_ILOCK(mp);
 		/*
 		 * If an inode was inserted or removed, we gotta
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index cd24711..962d74a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -46,6 +46,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_quota.h"
 #include "xfs_rw.h"
+#include "xfs_utils.h"
 
 STATIC int	xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
 STATIC int	xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
@@ -3248,7 +3249,7 @@ xlog_recover_process_iunlinks(
 					if (ip->i_d.di_mode == 0)
 						xfs_iput_new(ip, 0);
 					else
-						VN_RELE(XFS_ITOV(ip));
+						IRELE(ip);
 				} else {
 					/*
 					 * We can't read in the inode
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 41b690e..c2aafeb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,6 +43,7 @@
 #include "xfs_rw.h"
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
+#include "xfs_utils.h"
 
 STATIC void	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 STATIC int	xfs_uuid_mount(xfs_mount_t *);
@@ -956,7 +957,6 @@ xfs_mountfs(
 {
 	xfs_sb_t	*sbp = &(mp->m_sb);
 	xfs_inode_t	*rip;
-	bhv_vnode_t	*rvp = NULL;
 	__uint64_t	resblks;
 	__int64_t	update_flags = 0LL;
 	uint		quotamount, quotaflags;
@@ -1158,7 +1158,6 @@ xfs_mountfs(
 	}
 
 	ASSERT(rip != NULL);
-	rvp = XFS_ITOV(rip);
 
 	if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
 		cmn_err(CE_WARN, "XFS: corrupted root inode");
@@ -1241,7 +1240,7 @@ xfs_mountfs(
 	/*
 	 * Free up the root inode.
 	 */
-	VN_RELE(rvp);
+	IRELE(rip);
  error3:
 	xfs_log_unmount_dealloc(mp);
  error2:
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 47082c0..9cd6471 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -44,6 +44,7 @@
 #include "xfs_rw.h"
 #include "xfs_inode_item.h"
 #include "xfs_trans_space.h"
+#include "xfs_utils.h"
 
 
 /*
@@ -2278,7 +2279,7 @@ xfs_rtmount_inodes(
 	ASSERT(sbp->sb_rsumino != NULLFSINO);
 	error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0);
 	if (error) {
-		VN_RELE(XFS_ITOV(mp->m_rbmip));
+		IRELE(mp->m_rbmip);
 		return error;
 	}
 	ASSERT(mp->m_rsumip != NULL);
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 4c132a8..c21e4d1 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -55,6 +55,7 @@
 #include "xfs_fsops.h"
 #include "xfs_vnodeops.h"
 #include "xfs_vfsops.h"
+#include "xfs_utils.h"
 
 
 int __init
@@ -595,7 +596,7 @@ xfs_unmount(
 	/*
 	 * Drop the reference count
 	 */
-	VN_RELE(rvp);
+	IRELE(rip);
 
 	/*
 	 * If we're forcing a shutdown, typically because of a media error,
@@ -777,8 +778,8 @@ xfs_unmount_flush(
 		goto fscorrupt_out2;
 
 	if (rbmip) {
-		VN_RELE(XFS_ITOV(rbmip));
-		VN_RELE(XFS_ITOV(rsumip));
+		IRELE(rbmip);
+		IRELE(rsumip);
 	}
 
 	xfs_iunlock(rip, XFS_ILOCK_EXCL);
@@ -1156,10 +1157,10 @@ xfs_sync_inodes(
 			 * above, then wait until after we've unlocked
 			 * the inode to release the reference.  This is
 			 * because we can be already holding the inode
-			 * lock when VN_RELE() calls xfs_inactive().
+			 * lock when IRELE() calls xfs_inactive().
 			 *
 			 * Make sure to drop the mount lock before calling
-			 * VN_RELE() so that we don't trip over ourselves if
+			 * IRELE() so that we don't trip over ourselves if
 			 * we have to go for the mount lock again in the
 			 * inactive code.
 			 */
@@ -1167,7 +1168,7 @@ xfs_sync_inodes(
 				IPOINTER_INSERT(ip, mp);
 			}
 
-			VN_RELE(vp);
+			IRELE(ip);
 
 			vnode_refed = B_FALSE;
 		}
-- 
cgit v0.10.2


From 2abdb8c88110bab78bfe17e51346e735560daa02 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Thu, 27 Mar 2008 18:01:14 +1100
Subject: [XFS] Prevent xfs_bmap_check_leaf_extents() referencing unmapped
 memory.

While investigating the extent corruption bug I ran into this bug in
debug-only code. xfs_bmap_check_leaf_extents() loops through the leaf
blocks of the extent btree checking that every extent is entirely before
the next extent. It also compares the last extent in the previous block to
the first extent in the current block, but by that point the previous
block has been released and potentially unmapped. So take a copy of the
last extent instead of a pointer. Also move the last extent check out of
the loop because we only need to do it once.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30718a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index bce8e3b..7d683e0 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -6194,7 +6194,7 @@ xfs_bmap_check_leaf_extents(
 	xfs_mount_t		*mp;	/* file system mount structure */
 	__be64			*pp;	/* pointer to block address */
 	xfs_bmbt_rec_t		*ep;	/* pointer to current extent */
-	xfs_bmbt_rec_t		*lastp; /* pointer to previous extent */
+	xfs_bmbt_rec_t		last = {0, 0}; /* last extent in prev block */
 	xfs_bmbt_rec_t		*nextp;	/* pointer to next extent */
 	int			bp_release = 0;
 
@@ -6264,7 +6264,6 @@ xfs_bmap_check_leaf_extents(
 	/*
 	 * Loop over all leaf nodes checking that all extents are in the right order.
 	 */
-	lastp = NULL;
 	for (;;) {
 		xfs_fsblock_t	nextbno;
 		xfs_extnum_t	num_recs;
@@ -6285,18 +6284,16 @@ xfs_bmap_check_leaf_extents(
 		 */
 
 		ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+		if (i) {
+			xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep);
+		}
 		for (j = 1; j < num_recs; j++) {
 			nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1);
-			if (lastp) {
-				xfs_btree_check_rec(XFS_BTNUM_BMAP,
-					(void *)lastp, (void *)ep);
-			}
-			xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
-				(void *)(nextp));
-			lastp = ep;
+			xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp);
 			ep = nextp;
 		}
 
+		last = *ep;
 		i += num_recs;
 		if (bp_release) {
 			bp_release = 0;
-- 
cgit v0.10.2


From 114d23aae51233b2bc62d8e2a632bcb55de1953d Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:18:39 +1000
Subject: [XFS] Per iclog callback chain lock

Rather than use the icloglock for protecting the iclog completion callback
chain, use a new per-iclog lock so that walking the callback chain doesn't
require holding a global lock.

This reduces contention on the icloglock during transaction commit and log
I/O completion by reducing the number of times we need to hold the global
icloglock during these operations.

SGI-PV: 978729
SGI-Modid: xfs-linux-melb:xfs-kern:30770a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 1fa9809..7a5b12d 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -397,12 +397,10 @@ xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
 	       void		  *iclog_hndl,	/* iclog to hang callback off */
 	       xfs_log_callback_t *cb)
 {
-	xlog_t *log = mp->m_log;
 	xlog_in_core_t	  *iclog = (xlog_in_core_t *)iclog_hndl;
 	int	abortflg;
 
-	cb->cb_next = NULL;
-	spin_lock(&log->l_icloglock);
+	spin_lock(&iclog->ic_callback_lock);
 	abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
 	if (!abortflg) {
 		ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
@@ -411,7 +409,7 @@ xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
 		*(iclog->ic_callback_tail) = cb;
 		iclog->ic_callback_tail = &(cb->cb_next);
 	}
-	spin_unlock(&log->l_icloglock);
+	spin_unlock(&iclog->ic_callback_lock);
 	return abortflg;
 }	/* xfs_log_notify */
 
@@ -1257,6 +1255,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
 		iclog->ic_state = XLOG_STATE_ACTIVE;
 		iclog->ic_log = log;
+		atomic_set(&iclog->ic_refcnt, 0);
+		spin_lock_init(&iclog->ic_callback_lock);
 		iclog->ic_callback_tail = &(iclog->ic_callback);
 		iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
 
@@ -1987,7 +1987,7 @@ xlog_state_clean_log(xlog_t *log)
 		if (iclog->ic_state == XLOG_STATE_DIRTY) {
 			iclog->ic_state	= XLOG_STATE_ACTIVE;
 			iclog->ic_offset       = 0;
-			iclog->ic_callback	= NULL;   /* don't need to free */
+			ASSERT(iclog->ic_callback == NULL);
 			/*
 			 * If the number of ops in this iclog indicate it just
 			 * contains the dummy transaction, we can
@@ -2190,37 +2190,40 @@ xlog_state_do_callback(
 					be64_to_cpu(iclog->ic_header.h_lsn);
 				spin_unlock(&log->l_grant_lock);
 
-				/*
-				 * Keep processing entries in the callback list
-				 * until we come around and it is empty.  We
-				 * need to atomically see that the list is
-				 * empty and change the state to DIRTY so that
-				 * we don't miss any more callbacks being added.
-				 */
-				spin_lock(&log->l_icloglock);
 			} else {
+				spin_unlock(&log->l_icloglock);
 				ioerrors++;
 			}
-			cb = iclog->ic_callback;
 
+			/*
+			 * Keep processing entries in the callback list until
+			 * we come around and it is empty.  We need to
+			 * atomically see that the list is empty and change the
+			 * state to DIRTY so that we don't miss any more
+			 * callbacks being added.
+			 */
+			spin_lock(&iclog->ic_callback_lock);
+			cb = iclog->ic_callback;
 			while (cb) {
 				iclog->ic_callback_tail = &(iclog->ic_callback);
 				iclog->ic_callback = NULL;
-				spin_unlock(&log->l_icloglock);
+				spin_unlock(&iclog->ic_callback_lock);
 
 				/* perform callbacks in the order given */
 				for (; cb; cb = cb_next) {
 					cb_next = cb->cb_next;
 					cb->cb_func(cb->cb_arg, aborted);
 				}
-				spin_lock(&log->l_icloglock);
+				spin_lock(&iclog->ic_callback_lock);
 				cb = iclog->ic_callback;
 			}
 
 			loopdidcallbacks++;
 			funcdidcallbacks++;
 
+			spin_lock(&log->l_icloglock);
 			ASSERT(iclog->ic_callback == NULL);
+			spin_unlock(&iclog->ic_callback_lock);
 			if (!(iclog->ic_state & XLOG_STATE_IOERROR))
 				iclog->ic_state = XLOG_STATE_DIRTY;
 
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 01c63db..104b623 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -324,6 +324,19 @@ typedef struct xlog_rec_ext_header {
  * - ic_offset is the current number of bytes written to in this iclog.
  * - ic_refcnt is bumped when someone is writing to the log.
  * - ic_state is the state of the iclog.
+ *
+ * Because of cacheline contention on large machines, we need to separate
+ * various resources onto different cachelines. To start with, make the
+ * structure cacheline aligned. The following fields can be contended on
+ * by independent processes:
+ *
+ *	- ic_callback_*
+ *	- ic_refcnt
+ *	- fields protected by the global l_icloglock
+ *
+ * so we need to ensure that these fields are located in separate cachelines.
+ * We'll put all the read-only and l_icloglock fields in the first cacheline,
+ * and move everything else out to subsequent cachelines.
  */
 typedef struct xlog_iclog_fields {
 	sv_t			ic_forcesema;
@@ -332,18 +345,23 @@ typedef struct xlog_iclog_fields {
 	struct xlog_in_core	*ic_prev;
 	struct xfs_buf		*ic_bp;
 	struct log		*ic_log;
-	xfs_log_callback_t	*ic_callback;
-	xfs_log_callback_t	**ic_callback_tail;
-#ifdef XFS_LOG_TRACE
-	struct ktrace		*ic_trace;
-#endif
 	int			ic_size;
 	int			ic_offset;
-	atomic_t		ic_refcnt;
 	int			ic_bwritecnt;
 	ushort_t		ic_state;
 	char			*ic_datap;	/* pointer to iclog data */
-} xlog_iclog_fields_t;
+#ifdef XFS_LOG_TRACE
+	struct ktrace		*ic_trace;
+#endif
+
+	/* Callback structures need their own cacheline */
+	spinlock_t		ic_callback_lock ____cacheline_aligned_in_smp;
+	xfs_log_callback_t	*ic_callback;
+	xfs_log_callback_t	**ic_callback_tail;
+
+	/* reference counts need their own cacheline */
+	atomic_t		ic_refcnt ____cacheline_aligned_in_smp;
+} xlog_iclog_fields_t ____cacheline_aligned_in_smp;
 
 typedef union xlog_in_core2 {
 	xlog_rec_header_t	hic_header;
@@ -366,6 +384,7 @@ typedef struct xlog_in_core {
 #define	ic_bp		hic_fields.ic_bp
 #define	ic_log		hic_fields.ic_log
 #define	ic_callback	hic_fields.ic_callback
+#define	ic_callback_lock hic_fields.ic_callback_lock
 #define	ic_callback_tail hic_fields.ic_callback_tail
 #define	ic_trace	hic_fields.ic_trace
 #define	ic_size		hic_fields.ic_size
-- 
cgit v0.10.2


From eb01c9cd87c7a9998c2edf209721ea069e3e3652 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:18:46 +1000
Subject: [XFS] Remove the xlog_ticket allocator

The ticket allocator is just a simple slab implementation internal to the
log. It requires the icloglock to be held when manipulating it and this
contributes to contention on that lock.

Just kill the entire allocator and use a memory zone instead. While there,
allow us to gracefully fail allocation with ENOMEM.

SGI-PV: 978729
SGI-Modid: xfs-linux-melb:xfs-kern:30771a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7a5b12d..3cf115d 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -41,6 +41,7 @@
 #include "xfs_inode.h"
 #include "xfs_rw.h"
 
+kmem_zone_t	*xfs_log_ticket_zone;
 
 #define xlog_write_adv_cnt(ptr, len, off, bytes) \
 	{ (ptr) += (bytes); \
@@ -73,8 +74,6 @@ STATIC int  xlog_state_get_iclog_space(xlog_t		*log,
 				       xlog_ticket_t	*ticket,
 				       int		*continued_write,
 				       int		*logoffsetp);
-STATIC void xlog_state_put_ticket(xlog_t	*log,
-				  xlog_ticket_t *tic);
 STATIC int  xlog_state_release_iclog(xlog_t		*log,
 				     xlog_in_core_t	*iclog);
 STATIC void xlog_state_switch_iclogs(xlog_t		*log,
@@ -101,7 +100,6 @@ STATIC void xlog_ungrant_log_space(xlog_t	 *log,
 
 
 /* local ticket functions */
-STATIC void		xlog_state_ticket_alloc(xlog_t *log);
 STATIC xlog_ticket_t	*xlog_ticket_get(xlog_t *log,
 					 int	unit_bytes,
 					 int	count,
@@ -330,7 +328,7 @@ xfs_log_done(xfs_mount_t	*mp,
 		 */
 		xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
 		xlog_ungrant_log_space(log, ticket);
-		xlog_state_put_ticket(log, ticket);
+		xlog_ticket_put(log, ticket);
 	} else {
 		xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
 		xlog_regrant_reserve_log_space(log, ticket);
@@ -469,6 +467,8 @@ xfs_log_reserve(xfs_mount_t	 *mp,
 		/* may sleep if need to allocate more tickets */
 		internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
 						  client, flags);
+		if (!internal_ticket)
+			return XFS_ERROR(ENOMEM);
 		internal_ticket->t_trans_type = t_type;
 		*ticket = internal_ticket;
 		xlog_trace_loggrant(log, internal_ticket, 
@@ -693,7 +693,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		if (tic) {
 			xlog_trace_loggrant(log, tic, "unmount rec");
 			xlog_ungrant_log_space(log, tic);
-			xlog_state_put_ticket(log, tic);
+			xlog_ticket_put(log, tic);
 		}
 	} else {
 		/*
@@ -1208,7 +1208,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	spin_lock_init(&log->l_icloglock);
 	spin_lock_init(&log->l_grant_lock);
 	initnsema(&log->l_flushsema, 0, "ic-flush");
-	xlog_state_ticket_alloc(log);  /* wait until after icloglock inited */
 
 	/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
 	ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1538,7 +1537,6 @@ STATIC void
 xlog_dealloc_log(xlog_t *log)
 {
 	xlog_in_core_t	*iclog, *next_iclog;
-	xlog_ticket_t	*tic, *next_tic;
 	int		i;
 
 	iclog = log->l_iclog;
@@ -1559,22 +1557,6 @@ xlog_dealloc_log(xlog_t *log)
 	spinlock_destroy(&log->l_icloglock);
 	spinlock_destroy(&log->l_grant_lock);
 
-	/* XXXsup take a look at this again. */
-	if ((log->l_ticket_cnt != log->l_ticket_tcnt)  &&
-	    !XLOG_FORCED_SHUTDOWN(log)) {
-		xfs_fs_cmn_err(CE_WARN, log->l_mp,
-			"xlog_dealloc_log: (cnt: %d, total: %d)",
-			log->l_ticket_cnt, log->l_ticket_tcnt);
-		/* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */
-
-	} else {
-		tic = log->l_unmount_free;
-		while (tic) {
-			next_tic = tic->t_next;
-			kmem_free(tic, PAGE_SIZE);
-			tic = next_tic;
-		}
-	}
 	xfs_buf_free(log->l_xbuf);
 #ifdef XFS_LOG_TRACE
 	if (log->l_trace != NULL) {
@@ -2795,18 +2777,6 @@ xlog_ungrant_log_space(xlog_t	     *log,
 
 
 /*
- * Atomically put back used ticket.
- */
-STATIC void
-xlog_state_put_ticket(xlog_t	    *log,
-		      xlog_ticket_t *tic)
-{
-	spin_lock(&log->l_icloglock);
-	xlog_ticket_put(log, tic);
-	spin_unlock(&log->l_icloglock);
-}	/* xlog_state_put_ticket */
-
-/*
  * Flush iclog to disk if this is the last reference to the given iclog and
  * the WANT_SYNC bit is set.
  *
@@ -3176,92 +3146,19 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
  */
 
 /*
- *	Algorithm doesn't take into account page size. ;-(
- */
-STATIC void
-xlog_state_ticket_alloc(xlog_t *log)
-{
-	xlog_ticket_t	*t_list;
-	xlog_ticket_t	*next;
-	xfs_caddr_t	buf;
-	uint		i = (PAGE_SIZE / sizeof(xlog_ticket_t)) - 2;
-
-	/*
-	 * The kmem_zalloc may sleep, so we shouldn't be holding the
-	 * global lock.  XXXmiken: may want to use zone allocator.
-	 */
-	buf = (xfs_caddr_t) kmem_zalloc(PAGE_SIZE, KM_SLEEP);
-
-	spin_lock(&log->l_icloglock);
-
-	/* Attach 1st ticket to Q, so we can keep track of allocated memory */
-	t_list = (xlog_ticket_t *)buf;
-	t_list->t_next = log->l_unmount_free;
-	log->l_unmount_free = t_list++;
-	log->l_ticket_cnt++;
-	log->l_ticket_tcnt++;
-
-	/* Next ticket becomes first ticket attached to ticket free list */
-	if (log->l_freelist != NULL) {
-		ASSERT(log->l_tail != NULL);
-		log->l_tail->t_next = t_list;
-	} else {
-		log->l_freelist = t_list;
-	}
-	log->l_ticket_cnt++;
-	log->l_ticket_tcnt++;
-
-	/* Cycle through rest of alloc'ed memory, building up free Q */
-	for ( ; i > 0; i--) {
-		next = t_list + 1;
-		t_list->t_next = next;
-		t_list = next;
-		log->l_ticket_cnt++;
-		log->l_ticket_tcnt++;
-	}
-	t_list->t_next = NULL;
-	log->l_tail = t_list;
-	spin_unlock(&log->l_icloglock);
-}	/* xlog_state_ticket_alloc */
-
-
-/*
- * Put ticket into free list
- *
- * Assumption: log lock is held around this call.
+ * Free a used ticket.
  */
 STATIC void
 xlog_ticket_put(xlog_t		*log,
 		xlog_ticket_t	*ticket)
 {
 	sv_destroy(&ticket->t_sema);
-
-	/*
-	 * Don't think caching will make that much difference.  It's
-	 * more important to make debug easier.
-	 */
-#if 0
-	/* real code will want to use LIFO for caching */
-	ticket->t_next = log->l_freelist;
-	log->l_freelist = ticket;
-	/* no need to clear fields */
-#else
-	/* When we debug, it is easier if tickets are cycled */
-	ticket->t_next     = NULL;
-	if (log->l_tail) {
-		log->l_tail->t_next = ticket;
-	} else {
-		ASSERT(log->l_freelist == NULL);
-		log->l_freelist = ticket;
-	}
-	log->l_tail	    = ticket;
-#endif /* DEBUG */
-	log->l_ticket_cnt++;
+	kmem_zone_free(xfs_log_ticket_zone, ticket);
 }	/* xlog_ticket_put */
 
 
 /*
- * Grab ticket off freelist or allocation some more
+ * Allocate and initialise a new log ticket.
  */
 STATIC xlog_ticket_t *
 xlog_ticket_get(xlog_t		*log,
@@ -3273,21 +3170,9 @@ xlog_ticket_get(xlog_t		*log,
 	xlog_ticket_t	*tic;
 	uint		num_headers;
 
- alloc:
-	if (log->l_freelist == NULL)
-		xlog_state_ticket_alloc(log);		/* potentially sleep */
-
-	spin_lock(&log->l_icloglock);
-	if (log->l_freelist == NULL) {
-		spin_unlock(&log->l_icloglock);
-		goto alloc;
-	}
-	tic		= log->l_freelist;
-	log->l_freelist	= tic->t_next;
-	if (log->l_freelist == NULL)
-		log->l_tail = NULL;
-	log->l_ticket_cnt--;
-	spin_unlock(&log->l_icloglock);
+	tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
+	if (!tic)
+		return NULL;
 
 	/*
 	 * Permanent reservations have up to 'cnt'-1 active log operations
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 104b623..c158396 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -242,7 +242,7 @@ typedef struct xlog_res {
 
 typedef struct xlog_ticket {
 	sv_t		   t_sema;	 /* sleep on this semaphore      : 20 */
- 	struct xlog_ticket *t_next;	 /*			         :4|8 */
+	struct xlog_ticket *t_next;	 /*			         :4|8 */
 	struct xlog_ticket *t_prev;	 /*				 :4|8 */
 	xlog_tid_t	   t_tid;	 /* transaction identifier	 : 4  */
 	int		   t_curr_res;	 /* current reservation in bytes : 4  */
@@ -406,13 +406,8 @@ typedef struct log {
 	sema_t			l_flushsema;    /* iclog flushing semaphore */
 	int			l_flushcnt;	/* # of procs waiting on this
 						 * sema */
-	int			l_ticket_cnt;	/* free ticket count */
-	int			l_ticket_tcnt;	/* total ticket count */
 	int			l_covered_state;/* state of "covering disk
 						 * log entries" */
-	xlog_ticket_t		*l_freelist;    /* free list of tickets */
-	xlog_ticket_t		*l_unmount_free;/* kmem_free these addresses */
-	xlog_ticket_t		*l_tail;        /* free list of tickets */
 	xlog_in_core_t		*l_iclog;       /* head log queue	*/
 	spinlock_t		l_icloglock;    /* grab to change iclog state */
 	xfs_lsn_t		l_tail_lsn;     /* lsn of 1st LR with unflushed
@@ -478,6 +473,8 @@ extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
 extern void	 xlog_put_bp(struct xfs_buf *);
 extern int	 xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
 
+extern kmem_zone_t	*xfs_log_ticket_zone;
+
 /* iclog tracing */
 #define XLOG_TRACE_GRAB_FLUSH  1
 #define XLOG_TRACE_REL_FLUSH   2
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index c21e4d1..ea94593 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -69,15 +69,17 @@ xfs_init(void)
 	/*
 	 * Initialize all of the zone allocators we use.
 	 */
+	xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
+						"xfs_log_ticket");
 	xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
-						 "xfs_bmap_free_item");
+						"xfs_bmap_free_item");
 	xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
-					    "xfs_btree_cur");
-	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
-	xfs_da_state_zone =
-		kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state");
+						"xfs_btree_cur");
+	xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
+						"xfs_da_state");
 	xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
 	xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
 	xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
 	xfs_mru_cache_init();
 	xfs_filestream_init();
-- 
cgit v0.10.2


From 4679b2d36d53ed508c956337972fbbea8db99a77 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:18:54 +1000
Subject: [XFS] Reorganise xlog_t for better cacheline isolation of contention

To reduce contention on the log on machines with large CPU counts, separate
out different parts of the xlog_t structure onto different cachelines. Move
each lock onto a different cacheline along with all the members that are
accessed/modified while that lock is held.

Also, move the debugging-only code under #ifdef DEBUG.

SGI-PV: 978729
SGI-Modid: xfs-linux-melb:xfs-kern:30772a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3cf115d..319b98e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1237,9 +1237,9 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
 		iclog->ic_bp = bp;
 		iclog->hic_data = bp->b_addr;
-
+#ifdef DEBUG
 		log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
-
+#endif
 		head = &iclog->ic_header;
 		memset(head, 0, sizeof(xlog_rec_header_t));
 		head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
@@ -1250,7 +1250,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		head->h_fmt = cpu_to_be32(XLOG_FMT);
 		memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
 
-
 		iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
 		iclog->ic_state = XLOG_STATE_ACTIVE;
 		iclog->ic_log = log;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index c158396..8952a39 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -361,7 +361,7 @@ typedef struct xlog_iclog_fields {
 
 	/* reference counts need their own cacheline */
 	atomic_t		ic_refcnt ____cacheline_aligned_in_smp;
-} xlog_iclog_fields_t ____cacheline_aligned_in_smp;
+} xlog_iclog_fields_t;
 
 typedef union xlog_in_core2 {
 	xlog_rec_header_t	hic_header;
@@ -402,8 +402,29 @@ typedef struct xlog_in_core {
  * that round off problems won't occur when releasing partial reservations.
  */
 typedef struct log {
+	/* The following fields don't need locking */
+	struct xfs_mount	*l_mp;	        /* mount point */
+	struct xfs_buf		*l_xbuf;        /* extra buffer for log
+						 * wrapping */
+	struct xfs_buftarg	*l_targ;        /* buftarg of log */
+	uint			l_flags;
+	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
+	struct xfs_buf_cancel	**l_buf_cancel_table;
+	int			l_iclog_hsize;  /* size of iclog header */
+	int			l_iclog_heads;  /* # of iclog header sectors */
+	uint			l_sectbb_log;   /* log2 of sector size in BBs */
+	uint			l_sectbb_mask;  /* sector size (in BBs)
+						 * alignment mask */
+	int			l_iclog_size;	/* size of log in bytes */
+	int			l_iclog_size_log; /* log power size of log */
+	int			l_iclog_bufs;	/* number of iclog buffers */
+	xfs_daddr_t		l_logBBstart;   /* start block of log */
+	int			l_logsize;      /* size of log in bytes */
+	int			l_logBBsize;    /* size of log in BB chunks */
+
 	/* The following block of fields are changed while holding icloglock */
-	sema_t			l_flushsema;    /* iclog flushing semaphore */
+	sema_t			l_flushsema ____cacheline_aligned_in_smp;
+						/* iclog flushing semaphore */
 	int			l_flushcnt;	/* # of procs waiting on this
 						 * sema */
 	int			l_covered_state;/* state of "covering disk
@@ -413,27 +434,14 @@ typedef struct log {
 	xfs_lsn_t		l_tail_lsn;     /* lsn of 1st LR with unflushed
 						 * buffers */
 	xfs_lsn_t		l_last_sync_lsn;/* lsn of last LR on disk */
-	struct xfs_mount	*l_mp;	        /* mount point */
-	struct xfs_buf		*l_xbuf;        /* extra buffer for log
-						 * wrapping */
-	struct xfs_buftarg	*l_targ;        /* buftarg of log */
-	xfs_daddr_t		l_logBBstart;   /* start block of log */
-	int			l_logsize;      /* size of log in bytes */
-	int			l_logBBsize;    /* size of log in BB chunks */
 	int			l_curr_cycle;   /* Cycle number of log writes */
 	int			l_prev_cycle;   /* Cycle number before last
 						 * block increment */
 	int			l_curr_block;   /* current logical log block */
 	int			l_prev_block;   /* previous logical log block */
-	int			l_iclog_size;	/* size of log in bytes */
-	int			l_iclog_size_log; /* log power size of log */
-	int			l_iclog_bufs;	/* number of iclog buffers */
-
-	/* The following field are used for debugging; need to hold icloglock */
-	char			*l_iclog_bak[XLOG_MAX_ICLOGS];
 
 	/* The following block of fields are changed while holding grant_lock */
-	spinlock_t		l_grant_lock;
+	spinlock_t		l_grant_lock ____cacheline_aligned_in_smp;
 	xlog_ticket_t		*l_reserve_headq;
 	xlog_ticket_t		*l_write_headq;
 	int			l_grant_reserve_cycle;
@@ -441,19 +449,16 @@ typedef struct log {
 	int			l_grant_write_cycle;
 	int			l_grant_write_bytes;
 
-	/* The following fields don't need locking */
 #ifdef XFS_LOG_TRACE
 	struct ktrace		*l_trace;
 	struct ktrace		*l_grant_trace;
 #endif
-	uint			l_flags;
-	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
-	struct xfs_buf_cancel	**l_buf_cancel_table;
-	int			l_iclog_hsize;  /* size of iclog header */
-	int			l_iclog_heads;  /* # of iclog header sectors */
-	uint			l_sectbb_log;   /* log2 of sector size in BBs */
-	uint			l_sectbb_mask;  /* sector size (in BBs)
-						 * alignment mask */
+
+	/* The following field is used for debugging; need to hold icloglock */
+#ifdef DEBUG
+	char			*l_iclog_bak[XLOG_MAX_ICLOGS];
+#endif
+
 } xlog_t;
 
 #define XLOG_FORCED_SHUTDOWN(log)	((log)->l_flags & XLOG_IO_ERROR)
-- 
cgit v0.10.2


From 6b1d1a732f886936fe515d911b1a01d9cc50e179 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:19:02 +1000
Subject: [XFS] Fix lock inversion in forced shutdown.

Recent changes to xlog_state_release_iclog() placed the grant_lock inside
the icloglock. Forced unmount of the log takes them the opposite way
around, but does not depend on that order to work correctly. Fix the
inversion by changing the order in which the locks are taken in
xfs_log_force_umount().

SGI-PV: 979661
SGI-Modid: xfs-linux-melb:xfs-kern:30773a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 319b98e..4a6f7c5d 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3499,8 +3499,8 @@ xfs_log_force_umount(
 	 * before we mark the filesystem SHUTDOWN and wake
 	 * everybody up to tell the bad news.
 	 */
-	spin_lock(&log->l_grant_lock);
 	spin_lock(&log->l_icloglock);
+	spin_lock(&log->l_grant_lock);
 	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
 	XFS_BUF_DONE(mp->m_sb_bp);
 	/*
-- 
cgit v0.10.2


From 0225da1f35df46c67785eb08526995d7cdb4e3b0 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Thu, 10 Apr 2008 12:19:10 +1000
Subject: [XFS] Replace __inline with inline

Remove the remaining uses of __inline in the XFS code base.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30774a

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index e7f3da6..652721c 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -30,7 +30,7 @@ typedef struct cred {
 extern struct cred *sys_cred;
 
 /* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
-static __inline int capable_cred(cred_t *cr, int cid)
+static inline int capable_cred(cred_t *cr, int cid)
 {
 	return (cr == sys_cred) ? 1 : capable(cid);
 }
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index 8ba7a2f..afd0b0d 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -144,8 +144,8 @@ extern void xfs_cleanup_procfs(void);
 # define XFS_STATS_DEC(count)
 # define XFS_STATS_ADD(count, inc)
 
-static __inline void xfs_init_procfs(void) { };
-static __inline void xfs_cleanup_procfs(void) { };
+static inline void xfs_init_procfs(void) { };
+static inline void xfs_cleanup_procfs(void) { };
 
 #endif	/* !CONFIG_PROC_FS */
 
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h
index a50ffab..5b964fc 100644
--- a/fs/xfs/quota/xfs_qm_stats.h
+++ b/fs/xfs/quota/xfs_qm_stats.h
@@ -45,8 +45,8 @@ extern void xfs_qm_cleanup_procfs(void);
 
 # define XQM_STATS_INC(count)	do { } while (0)
 
-static __inline void xfs_qm_init_procfs(void) { };
-static __inline void xfs_qm_cleanup_procfs(void) { };
+static inline void xfs_qm_init_procfs(void) { };
+static inline void xfs_qm_cleanup_procfs(void) { };
 
 #endif
 
-- 
cgit v0.10.2


From 34a622b2e1c8e11c8990184634f101c1aad42fec Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Thu, 10 Apr 2008 12:19:21 +1000
Subject: [XFS] replace remaining __FUNCTION__ occurrences

__FUNCTION__ is gcc-specific, use __func__

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30775a

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index e040f1c..9b1bb17 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -37,7 +37,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 #ifdef DEBUG
 	if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
 		printk(KERN_WARNING "Large %s attempt, size=%ld\n",
-			__FUNCTION__, (long)size);
+			__func__, (long)size);
 		dump_stack();
 	}
 #endif
@@ -52,7 +52,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 		if (!(++retries % 100))
 			printk(KERN_ERR "XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
-					__FUNCTION__, lflags);
+					__func__, lflags);
 		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
@@ -129,7 +129,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 		if (!(++retries % 100))
 			printk(KERN_ERR "XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
-					__FUNCTION__, lflags);
+					__func__, lflags);
 		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e347bfd..142ddbe 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -400,7 +400,7 @@ _xfs_buf_lookup_pages(
 				printk(KERN_ERR
 					"XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
-					__FUNCTION__, gfp_mask);
+					__func__, gfp_mask);
 
 			XFS_STATS_INC(xb_page_retries);
 			xfsbufd_wakeup(0, gfp_mask);
@@ -598,7 +598,7 @@ xfs_buf_get_flags(
 		error = _xfs_buf_map_pages(bp, flags);
 		if (unlikely(error)) {
 			printk(KERN_WARNING "%s: failed to map pages\n",
-					__FUNCTION__);
+					__func__);
 			goto no_buffer;
 		}
 	}
@@ -778,7 +778,7 @@ xfs_buf_get_noaddr(
 	error = _xfs_buf_map_pages(bp, XBF_MAPPED);
 	if (unlikely(error)) {
 		printk(KERN_WARNING "%s: failed to map pages\n",
-				__FUNCTION__);
+				__func__);
 		goto fail_free_mem;
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 72e55db..fb561be 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -935,7 +935,7 @@ xfs_fs_clear_inode(
 		xfs_inactive(ip);
 		xfs_iflags_clear(ip, XFS_IMODIFIED);
 		if (xfs_reclaim(ip))
-			panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, inode);
+			panic("%s: cannot reclaim 0x%p\n", __func__, inode);
 	}
 
 	ASSERT(XFS_I(inode) == NULL);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 4ed5914..dbb8a5d 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -288,9 +288,9 @@ extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
 extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
 extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
 #define xfs_itrace_entry(ip)	\
-	_xfs_itrace_entry(ip, __FUNCTION__, (inst_t *)__return_address)
+	_xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
 #define xfs_itrace_exit(ip)	\
-	_xfs_itrace_exit(ip, __FUNCTION__, (inst_t *)__return_address)
+	_xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
 #define xfs_itrace_exit_tag(ip, tag)	\
 	_xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
 #define xfs_itrace_ref(ip)	\
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bdbfbbe..bd5c017 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -55,17 +55,17 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 ktrace_t *xfs_alloc_trace_buf;
 
 #define	TRACE_ALLOC(s,a)	\
-	xfs_alloc_trace_alloc(__FUNCTION__, s, a, __LINE__)
+	xfs_alloc_trace_alloc(__func__, s, a, __LINE__)
 #define	TRACE_FREE(s,a,b,x,f)	\
-	xfs_alloc_trace_free(__FUNCTION__, s, mp, a, b, x, f, __LINE__)
+	xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__)
 #define	TRACE_MODAGF(s,a,f)	\
-	xfs_alloc_trace_modagf(__FUNCTION__, s, mp, a, f, __LINE__)
-#define	TRACE_BUSY(__FUNCTION__,s,ag,agb,l,sl,tp)	\
-	xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
-#define	TRACE_UNBUSY(__FUNCTION__,s,ag,sl,tp)	\
-	xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
-#define	TRACE_BUSYSEARCH(__FUNCTION__,s,ag,agb,l,sl,tp)	\
-	xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
+	xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__)
+#define	TRACE_BUSY(__func__,s,ag,agb,l,sl,tp)	\
+	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
+#define	TRACE_UNBUSY(__func__,s,ag,sl,tp)	\
+	xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
+#define	TRACE_BUSYSEARCH(__func__,s,ag,agb,l,sl,tp)	\
+	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
 #else
 #define	TRACE_ALLOC(s,a)
 #define	TRACE_FREE(s,a,b,x,f)
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 7d683e0..65b8fa8 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -323,13 +323,13 @@ xfs_bmap_trace_pre_update(
 	int		whichfork);	/* data or attr fork */
 
 #define	XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)	\
-	xfs_bmap_trace_delete(__FUNCTION__,d,ip,i,c,w)
+	xfs_bmap_trace_delete(__func__,d,ip,i,c,w)
 #define	XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)	\
-	xfs_bmap_trace_insert(__FUNCTION__,d,ip,i,c,r1,r2,w)
+	xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w)
 #define	XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w)	\
-	xfs_bmap_trace_post_update(__FUNCTION__,d,ip,i,w)
+	xfs_bmap_trace_post_update(__func__,d,ip,i,w)
 #define	XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w)	\
-	xfs_bmap_trace_pre_update(__FUNCTION__,d,ip,i,w)
+	xfs_bmap_trace_pre_update(__func__,d,ip,i,w)
 #else
 #define	XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)
 #define	XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)
@@ -6164,10 +6164,10 @@ xfs_check_block(
 			}
 			if (*thispa == *pp) {
 				cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
-					__FUNCTION__, j, i,
+					__func__, j, i,
 					(unsigned long long)be64_to_cpu(*thispa));
 				panic("%s: ptrs are equal in node\n",
-					__FUNCTION__);
+					__func__);
 			}
 		}
 	}
@@ -6324,13 +6324,13 @@ xfs_bmap_check_leaf_extents(
 	return;
 
 error0:
-	cmn_err(CE_WARN, "%s: at error0", __FUNCTION__);
+	cmn_err(CE_WARN, "%s: at error0", __func__);
 	if (bp_release)
 		xfs_trans_brelse(NULL, bp);
 error_norelse:
 	cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents",
-		__FUNCTION__, i);
-	panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__);
+		__func__, i);
+	panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
 	return;
 }
 #endif
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 87224b7..6ff70cd 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -151,7 +151,7 @@ xfs_bmap_trace_exlist(
 	xfs_extnum_t		cnt,		/* count of entries in list */
 	int			whichfork);	/* data or attr fork */
 #define	XFS_BMAP_TRACE_EXLIST(ip,c,w)	\
-	xfs_bmap_trace_exlist(__FUNCTION__,ip,c,w)
+	xfs_bmap_trace_exlist(__func__,ip,c,w)
 #else
 #define	XFS_BMAP_TRACE_EXLIST(ip,c,w)
 #endif
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 93470b7..4f0e849 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -275,21 +275,21 @@ xfs_bmbt_trace_cursor(
 }
 
 #define	XFS_BMBT_TRACE_ARGBI(c,b,i)	\
-	xfs_bmbt_trace_argbi(__FUNCTION__, c, b, i, __LINE__)
+	xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
 #define	XFS_BMBT_TRACE_ARGBII(c,b,i,j)	\
-	xfs_bmbt_trace_argbii(__FUNCTION__, c, b, i, j, __LINE__)
+	xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
 #define	XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)	\
-	xfs_bmbt_trace_argfffi(__FUNCTION__, c, o, b, i, j, __LINE__)
+	xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
 #define	XFS_BMBT_TRACE_ARGI(c,i)	\
-	xfs_bmbt_trace_argi(__FUNCTION__, c, i, __LINE__)
+	xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
 #define	XFS_BMBT_TRACE_ARGIFK(c,i,f,s)	\
-	xfs_bmbt_trace_argifk(__FUNCTION__, c, i, f, s, __LINE__)
+	xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
 #define	XFS_BMBT_TRACE_ARGIFR(c,i,f,r)	\
-	xfs_bmbt_trace_argifr(__FUNCTION__, c, i, f, r, __LINE__)
+	xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
 #define	XFS_BMBT_TRACE_ARGIK(c,i,k)	\
-	xfs_bmbt_trace_argik(__FUNCTION__, c, i, k, __LINE__)
+	xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
 #define	XFS_BMBT_TRACE_CURSOR(c,s)	\
-	xfs_bmbt_trace_cursor(__FUNCTION__, c, s, __LINE__)
+	xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
 #else
 #define	XFS_BMBT_TRACE_ARGBI(c,b,i)
 #define	XFS_BMBT_TRACE_ARGBII(c,b,i,j)
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index eb03eab..3f3785b 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -73,7 +73,7 @@ xfs_filestreams_trace(
 #define TRACE4(mp,t,a0,a1,a2,a3)	TRACE6(mp,t,a0,a1,a2,a3,0,0)
 #define TRACE5(mp,t,a0,a1,a2,a3,a4)	TRACE6(mp,t,a0,a1,a2,a3,a4,0)
 #define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
-	xfs_filestreams_trace(mp, t, __FUNCTION__, __LINE__, \
+	xfs_filestreams_trace(mp, t, __func__, __LINE__, \
 				(__psunsigned_t)a0, (__psunsigned_t)a1, \
 				(__psunsigned_t)a2, (__psunsigned_t)a3, \
 				(__psunsigned_t)a4, (__psunsigned_t)a5)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4a6f7c5d..bece882 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2225,7 +2225,7 @@ xlog_state_do_callback(
 			repeats = 0;
 			xfs_fs_cmn_err(CE_WARN, log->l_mp,
 				"%s: possible infinite loop (%d iterations)",
-				__FUNCTION__, flushcnt);
+				__func__, flushcnt);
 		}
 	} while (!ioerrors && loopdidcallbacks);
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 962d74a..c375214 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -192,7 +192,7 @@ xlog_header_check_dump(
 {
 	int			b;
 
-	cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __FUNCTION__);
+	cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __func__);
 	for (b = 0; b < 16; b++)
 		cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
 	cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
@@ -3447,7 +3447,7 @@ xlog_valid_rec_header(
 	    (!rhead->h_version ||
 	    (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
 		xlog_warn("XFS: %s: unrecognised log version (%d).",
-			__FUNCTION__, be32_to_cpu(rhead->h_version));
+			__func__, be32_to_cpu(rhead->h_version));
 		return XFS_ERROR(EIO);
 	}
 
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 13235ae..1f77c00 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -438,7 +438,7 @@ xfs_trans_delete_ail(
 		else {
 			xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
 		"%s: attempting to delete a log item that is not in the AIL",
-					__FUNCTION__);
+					__func__);
 			spin_unlock(&mp->m_ail_lock);
 			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 		}
-- 
cgit v0.10.2


From b6ddc4e6fed9c6f4adb273c8b36e1731f90ec17e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 10 Apr 2008 12:19:27 +1000
Subject: [XFS] Don't validate symlink target component length

This target component validation is not POSIX conformant and it is not
done by any other Linux filesystem so remove it from XFS.

SGI-PV: 980080
SGI-Modid: xfs-linux-melb:xfs-kern:30776a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 3418c94..d46f24c 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3105,31 +3105,6 @@ xfs_symlink(
 	pathlen = strlen(target_path);
 	if (pathlen >= MAXPATHLEN)      /* total string too long */
 		return XFS_ERROR(ENAMETOOLONG);
-	if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
-		int len, total;
-		char *path;
-
-		for (total = 0, path = target_path; total < pathlen;) {
-			/*
-			 * Skip any slashes.
-			 */
-			while(*path == '/') {
-				total++;
-				path++;
-			}
-
-			/*
-			 * Count up to the next slash or end of path.
-			 * Error out if the component is bigger than MAXNAMELEN.
-			 */
-			for(len = 0; *path != '/' && total < pathlen;total++, path++) {
-				if (++len >= MAXNAMELEN) {
-					error = ENAMETOOLONG;
-					return error;
-				}
-			}
-		}
-	}
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
-- 
cgit v0.10.2


From 3c85c36cc2e87018d38fcd033f41bbdf1360c07a Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:19:40 +1000
Subject: [XFS] xfs_quiesce_fs() never returns an error. Mark it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30780a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index ea94593..6351efb 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -637,7 +637,7 @@ out:
 	return XFS_ERROR(error);
 }
 
-STATIC int
+STATIC void
 xfs_quiesce_fs(
 	xfs_mount_t		*mp)
 {
@@ -661,8 +661,6 @@ xfs_quiesce_fs(
 			count++;
 		}
 	} while (count < 2);
-
-	return 0;
 }
 
 /*
-- 
cgit v0.10.2


From a414047fc97aea7db6237176ce00013117839cd5 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:19:47 +1000
Subject: [XFS] Remove useless whitespace in function prototypes

Makes it simpler to annotate function prototypes with __must_check via sed
scripts.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30781a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index c4c4a6a..701accb 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -21,14 +21,14 @@
 #define IRELE(ip)	VN_RELE(XFS_ITOV(ip))
 #define IHOLD(ip)	VN_HOLD(XFS_ITOV(ip))
 
-extern int xfs_dir_lookup_int (xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *,
+extern int xfs_dir_lookup_int(xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *,
 				xfs_inode_t **);
-extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *);
-extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
+extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
+extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
 				xfs_dev_t, cred_t *, prid_t, int,
 				xfs_inode_t **, int *);
-extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *);
-extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *);
-extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *);
+extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
+extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
+extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
 
 #endif	/* __XFS_UTILS_H__ */
-- 
cgit v0.10.2


From 36fbe6e6bd5408b09341043dfece978b4a7a0f34 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:19:56 +1000
Subject: [XFS] xfs_icsb_disable_counter() never returns an error.

Mark it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30782a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c2aafeb..eb348c1 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -58,7 +58,7 @@ STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
 STATIC void	xfs_icsb_sync_counters(xfs_mount_t *);
 STATIC int	xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
 						int64_t, int);
-STATIC int	xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
+STATIC void	xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
 
 #else
 
@@ -2183,7 +2183,7 @@ xfs_icsb_counter_disabled(
 	return test_bit(field, &mp->m_icsb_counters);
 }
 
-STATIC int
+STATIC void
 xfs_icsb_disable_counter(
 	xfs_mount_t	*mp,
 	xfs_sb_field_t	field)
@@ -2201,7 +2201,7 @@ xfs_icsb_disable_counter(
 	 * the m_icsb_mutex.
 	 */
 	if (xfs_icsb_counter_disabled(mp, field))
-		return 0;
+		return;
 
 	xfs_icsb_lock_all_counters(mp);
 	if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
@@ -2224,8 +2224,6 @@ xfs_icsb_disable_counter(
 	}
 
 	xfs_icsb_unlock_all_counters(mp);
-
-	return 0;
 }
 
 STATIC void
-- 
cgit v0.10.2


From 714082bc12b6c305f825411df02177efcb0085f1 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:03 +1000
Subject: [XFS] Report errors from xfs_reserve_blocks().

xfs_reserve_blocks() can fail in interesting ways. Neither at mount nor at
unmount is this a fatal error, but the result can lead to sub-optimal
behaviour. Warn to the syslog if the call fails but otherwise continue.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30784a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index eb348c1..244aa1b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1227,12 +1227,15 @@ xfs_mountfs(
 	 *
 	 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
 	 * This may drive us straight to ENOSPC on mount, but that implies
-	 * we were already there on the last unmount.
+	 * we were already there on the last unmount. Warn if this occurs.
 	 */
 	resblks = mp->m_sb.sb_dblocks;
 	do_div(resblks, 20);
 	resblks = min_t(__uint64_t, resblks, 1024);
-	xfs_reserve_blocks(mp, &resblks, NULL);
+	error = xfs_reserve_blocks(mp, &resblks, NULL);
+	if (error)
+		cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. "
+				"Continuing without a reserve pool.");
 
 	return 0;
 
@@ -1268,6 +1271,7 @@ int
 xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
 {
 	__uint64_t	resblks;
+	int		error = 0;
 
 	/*
 	 * We can potentially deadlock here if we have an inode cluster
@@ -1311,7 +1315,11 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
 	 * value does not matter....
 	 */
 	resblks = 0;
-	xfs_reserve_blocks(mp, &resblks, NULL);
+	error = xfs_reserve_blocks(mp, &resblks, NULL);
+	if (error)
+		cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. "
+				"Freespace may not be correct on next mount.");
+
 
 	xfs_log_sbcount(mp, 1);
 	xfs_unmountfs_writesb(mp);
-- 
cgit v0.10.2


From 5b1397385bf536cbdb60f3362f44079d15d5f5ee Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:10 +1000
Subject: [XFS] xfs_qm_reset_dqcounts() does not return errors.

Declare it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30785a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index adbc7bb..dec5f95 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1438,7 +1438,7 @@ xfs_qm_qino_alloc(
 }
 
 
-STATIC int
+STATIC void
 xfs_qm_reset_dqcounts(
 	xfs_mount_t	*mp,
 	xfs_buf_t	*bp,
@@ -1478,8 +1478,6 @@ xfs_qm_reset_dqcounts(
 		ddq->d_rtbwarns = 0;
 		ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
 	}
-
-	return 0;
 }
 
 STATIC int
@@ -1520,7 +1518,7 @@ xfs_qm_dqiter_bufs(
 		if (error)
 			break;
 
-		(void) xfs_qm_reset_dqcounts(mp, bp, firstid, type);
+		xfs_qm_reset_dqcounts(mp, bp, firstid, type);
 		xfs_bdwrite(mp, bp);
 		/*
 		 * goto the next block.
-- 
cgit v0.10.2


From 4b8879df8c21bed3efd1eb2da5d72501199aba29 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:17 +1000
Subject: [XFS] Propagate xfs_qm_dqflush_all() errors.

xfs_qm_dqflush_all() can return flush errors. Ensure they are propagated
into the quotacheck code to determine if the quotacheck succeeded or not.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30786a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index dec5f95..04b29c6 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1878,6 +1878,14 @@ xfs_qm_quotacheck(
 	} while (! done);
 
 	/*
+	 * We've made all the changes that we need to make incore.
+	 * Flush them down to disk buffers if everything was updated
+	 * successfully.
+	 */
+	if (!error)
+		error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
+
+	/*
 	 * We can get this error if we couldn't do a dquot allocation inside
 	 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
 	 * dirty dquots that might be cached, we just want to get rid of them
@@ -1888,11 +1896,6 @@ xfs_qm_quotacheck(
 		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
 		goto error_return;
 	}
-	/*
-	 * We've made all the changes that we need to make incore.
-	 * Now flush_them down to disk buffers.
-	 */
-	xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
 
 	/*
 	 * We didn't log anything, because if we crashed, we'll have to
-- 
cgit v0.10.2


From 3c56836f92683cb871ebbf44c512069b0d48a08f Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:24 +1000
Subject: [XFS] Check for dquot flush errors

xfs_qm_dqflush() can fail, but the return is not checked anywhere. Hence
we never know if we've failed to flush a dquot to disk. Propagate the
error and warn to the syslog if a flush ever fails.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30787a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 665babc..15214fb 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1439,9 +1439,7 @@ xfs_qm_dqpurge(
 	uint		flags)
 {
 	xfs_dqhash_t	*thishash;
-	xfs_mount_t	*mp;
-
-	mp = dqp->q_mount;
+	xfs_mount_t	*mp = dqp->q_mount;
 
 	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
 	ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
@@ -1485,6 +1483,7 @@ xfs_qm_dqpurge(
 	 * we're unmounting, we do care, so we flush it and wait.
 	 */
 	if (XFS_DQ_IS_DIRTY(dqp)) {
+		int	error;
 		xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
 		/* dqflush unlocks dqflock */
 		/*
@@ -1495,7 +1494,10 @@ xfs_qm_dqpurge(
 		 * We don't care about getting disk errors here. We need
 		 * to purge this dquot anyway, so we go ahead regardless.
 		 */
-		(void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+		error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+		if (error)
+			xfs_fs_cmn_err(CE_WARN, mp,
+				"xfs_qm_dqpurge: dquot %p flush failed", dqp);
 		xfs_dqflock(dqp);
 	}
 	ASSERT(dqp->q_pincount == 0);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 1800e8d..3dedce1 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -146,6 +146,7 @@ xfs_qm_dquot_logitem_push(
 	xfs_dq_logitem_t	*logitem)
 {
 	xfs_dquot_t	*dqp;
+	int		error;
 
 	dqp = logitem->qli_dquot;
 
@@ -161,7 +162,11 @@ xfs_qm_dquot_logitem_push(
 	 * lock without sleeping, then there must not have been
 	 * anyone in the process of flushing the dquot.
 	 */
-	xfs_qm_dqflush(dqp, XFS_B_DELWRI);
+	error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+	if (error)
+		xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+			"xfs_qm_dquot_logitem_push: push error %d on dqp %p",
+			error, dqp);
 	xfs_dqunlock(dqp);
 }
 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 04b29c6..0ed3c82 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -2094,12 +2094,17 @@ xfs_qm_shake_freelist(
 		 * dirty dquots.
 		 */
 		if (XFS_DQ_IS_DIRTY(dqp)) {
+			int	error;
 			xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
 			/*
 			 * We flush it delayed write, so don't bother
 			 * releasing the mplock.
 			 */
-			(void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			if (error) {
+				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+			"xfs_qm_dqflush_all: dquot %p flush failed", dqp);
+			}
 			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
 			dqp = dqp->dq_flnext;
 			continue;
@@ -2266,12 +2271,17 @@ xfs_qm_dqreclaim_one(void)
 		 * dirty dquots.
 		 */
 		if (XFS_DQ_IS_DIRTY(dqp)) {
+			int	error;
 			xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
 			/*
 			 * We flush it delayed write, so don't bother
 			 * releasing the freelist lock.
 			 */
-			(void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			if (error) {
+				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
+			}
 			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
 			continue;
 		}
-- 
cgit v0.10.2


From 53aa7915d67b9d0f5986c9f08e76846fedc520d4 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:31 +1000
Subject: [XFS] Clean up quotamount error handling.

xfs_qm_mount_quotas() returns an error status that is ignored. If we fail
to mount quotas, we continue with quotas turned off, which is all handled
inside xfs_qm_mount_quotas(). Mark it as void to indicate that errors need
not be returned to the callers.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30788a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 0ed3c82..e15ee7c 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -304,8 +304,11 @@ xfs_qm_unmount_quotadestroy(
  * necessary data structures like quotainfo.  This is also responsible for
  * running a quotacheck as necessary.  We are guaranteed that the superblock
  * is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to inidicate success or failure at all.
  */
-int
+void
 xfs_qm_mount_quotas(
 	xfs_mount_t	*mp,
 	int		mfsi_flags)
@@ -313,7 +316,6 @@ xfs_qm_mount_quotas(
 	int		error = 0;
 	uint		sbf;
 
-
 	/*
 	 * If quotas on realtime volumes is not supported, we disable
 	 * quotas immediately.
@@ -332,7 +334,8 @@ xfs_qm_mount_quotas(
 	 * Allocate the quotainfo structure inside the mount struct, and
 	 * create quotainode(s), and change/rev superblock if necessary.
 	 */
-	if ((error = xfs_qm_init_quotainfo(mp))) {
+	error = xfs_qm_init_quotainfo(mp);
+	if (error) {
 		/*
 		 * We must turn off quotas.
 		 */
@@ -344,12 +347,11 @@ xfs_qm_mount_quotas(
 	 * If any of the quotas are not consistent, do a quotacheck.
 	 */
 	if (XFS_QM_NEED_QUOTACHECK(mp) &&
-		!(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
-		if ((error = xfs_qm_quotacheck(mp))) {
-			/* Quotacheck has failed and quotas have
-			 * been disabled.
-			 */
-			return XFS_ERROR(error);
+	    !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
+		error = xfs_qm_quotacheck(mp);
+		if (error) {
+			/* Quotacheck failed and disabled quotas. */
+			return;
 		}
 	}
 	/* 
@@ -357,12 +359,10 @@ xfs_qm_mount_quotas(
 	 * quotachecked status, since we won't be doing accounting for
 	 * that type anymore.
 	 */
-	if (!XFS_IS_UQUOTA_ON(mp)) {
+	if (!XFS_IS_UQUOTA_ON(mp))
 		mp->m_qflags &= ~XFS_UQUOTA_CHKD;
-	}
-	if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) {
+	if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
 		mp->m_qflags &= ~XFS_OQUOTA_CHKD;
-	}
 
  write_changes:
 	/*
@@ -392,7 +392,7 @@ xfs_qm_mount_quotas(
 		xfs_fs_cmn_err(CE_WARN, mp,
 			"Failed to initialize disk quotas.");
 	}
-	return XFS_ERROR(error);
+	return;
 }
 
 /*
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index baf537c..cd2300e 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_RELE(xqm)	((xqm)->qm_nrefs--)
 
 extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int		xfs_qm_mount_quotas(xfs_mount_t *, int);
+extern void		xfs_qm_mount_quotas(xfs_mount_t *, int);
 extern int		xfs_qm_quotacheck(xfs_mount_t *);
 extern void		xfs_qm_unmount_quotadestroy(xfs_mount_t *);
 extern int		xfs_qm_unmount_quotas(xfs_mount_t *);
-- 
cgit v0.10.2


From 31d5577b35d8397dea19f2ba7550e9225605a785 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:38 +1000
Subject: [XFS] Catch errors resetting quota flags.

Warn to the syslog if we fail to reset the quota flags in the superblock
when a quota check fails.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30789a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index e15ee7c..6aa3445 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1927,7 +1927,10 @@ xfs_qm_quotacheck(
 		ASSERT(mp->m_quotainfo != NULL);
 		ASSERT(xfs_Gqm != NULL);
 		xfs_qm_destroy_quotainfo(mp);
-		(void)xfs_mount_reset_sbqflags(mp);
+		if (xfs_mount_reset_sbqflags(mp)) {
+			cmn_err(CE_WARN, "XFS quotacheck %s: "
+				"Failed to reset quota flags.", mp->m_fsname);
+		}
 	} else {
 		cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
 	}
-- 
cgit v0.10.2


From cb6edc26c386d2268dcf61bcdec02b6fb50b6ba2 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:45 +1000
Subject: [XFS] Catch errors when turning off quotas.

When turning off quota, we need to write various transactions to the log
to ensure that they are cleanly removed in the case of a crash. We need to
check that the transactions hit the disk correctly. If we fail to write
the final quota off transaction, we are corrupt in memory and so the only
option is to shut the filesystem down at this point.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30790a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 3dc161f..61cf68d 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -279,9 +279,12 @@ xfs_qm_scall_quotaoff(
 
 	/*
 	 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
-	 * and synchronously.
+	 * and synchronously. If we fail to write, we should abort the
+	 * operation as it cannot be recovered safely if we crash.
 	 */
-	xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+	error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+	if (error)
+		goto out_error;
 
 	/*
 	 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -337,7 +340,12 @@ xfs_qm_scall_quotaoff(
 	 * So, we have QUOTAOFF start and end logitems; the start
 	 * logitem won't get overwritten until the end logitem appears...
 	 */
-	xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+	error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+	if (error) {
+		/* We're screwed now. Shutdown is the only option. */
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+		goto out_error;
+	}
 
 	/*
 	 * If quotas is completely disabled, close shop.
@@ -361,6 +369,7 @@ xfs_qm_scall_quotaoff(
 		XFS_PURGE_INODE(XFS_QI_GQIP(mp));
 		XFS_QI_GQIP(mp) = NULL;
 	}
+out_error:
 	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
 
 	return (error);
-- 
cgit v0.10.2


From 88ab02085363b7c45935d66ab3e969b4fec9a20c Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:51 +1000
Subject: [XFS] Propagate quota file truncation errors.

Truncating the quota files can silently fail. Ensure that truncation
errors are propagated to the callers.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30791a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 61cf68d..556018d 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -380,12 +380,11 @@ xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
 {
-	int		error;
+	int		error = 0, error2 = 0;
 	xfs_inode_t	*qip;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return XFS_ERROR(EPERM);
-	error = 0;
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
 		qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
 		return XFS_ERROR(EINVAL);
@@ -393,22 +392,22 @@ xfs_qm_scall_trunc_qfiles(
 
 	if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
 		error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
-		if (! error) {
-			(void) xfs_truncate_file(mp, qip);
+		if (!error) {
+			error = xfs_truncate_file(mp, qip);
 			IRELE(qip);
 		}
 	}
 
 	if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
 	    mp->m_sb.sb_gquotino != NULLFSINO) {
-		error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
-		if (! error) {
-			(void) xfs_truncate_file(mp, qip);
+		error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
+		if (!error2) {
+			error2 = xfs_truncate_file(mp, qip);
 			IRELE(qip);
 		}
 	}
 
-	return (error);
+	return error ? error : error2;
 }
 
 
-- 
cgit v0.10.2


From 0c928299676c8df2b00e75d5691cd4846e6c0868 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:58 +1000
Subject: [XFS] Catch errors from xfs_acl_setmode().

Propagate the error status from xfs_acl_setmode() so that callers know if
the ACL was set correctly or not.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30792a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 7272fe3..98b515d 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -307,12 +307,13 @@ xfs_acl_vset(
 
 	VN_HOLD(vp);
 	error = xfs_acl_allow_set(vp, kind);
-	if (error)
-		goto out;
 
 	/* Incoming ACL exists, set file mode based on its value */
-	if (kind == _ACL_TYPE_ACCESS)
-		xfs_acl_setmode(vp, xfs_acl, &basicperms);
+	if (!error && kind == _ACL_TYPE_ACCESS)
+		error = xfs_acl_setmode(vp, xfs_acl, &basicperms);
+
+	if (error)
+		goto out;
 
 	/*
 	 * If we have more than std unix permissions, set up the actual attr.
@@ -707,7 +708,9 @@ xfs_acl_inherit(
 
 	memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
 	xfs_acl_filter_mode(mode, cacl);
-	xfs_acl_setmode(vp, cacl, &basicperms);
+	error = xfs_acl_setmode(vp, cacl, &basicperms);
+	if (error)
+		goto out_error;
 
 	/*
 	 * Set the Default and Access ACL on the file.  The mode is already
@@ -720,6 +723,7 @@ xfs_acl_inherit(
 		xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
 	if (!error && !basicperms)
 		xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
+out_error:
 	_ACL_FREE(cacl);
 	return error;
 }
-- 
cgit v0.10.2


From 5ca1f261a08d5cff5f29eaa0887b59baae2ae7f7 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:04 +1000
Subject: [XFS] Catch errors from xfs_acl_vremove().

Removing an ACL can return an error. Propagate it.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30793a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 98b515d..8e130b9 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -324,7 +324,7 @@ xfs_acl_vset(
 	if (!basicperms) {
 		xfs_acl_set_attr(vp, xfs_acl, kind, &error);
 	} else {
-		xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
+		error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
 	}
 
 out:
-- 
cgit v0.10.2


From 3c1e2bbe5bcdcd435510a05eb121fa74b848e24f Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:11 +1000
Subject: [XFS] Propagate xfs_trans_reserve() errors.

xfs_trans_reserve() reports errors that should not be ignored. For
example, a shutdown filesystem will report errors through
xfs_trans_reserve() to prevent further changes from being attempted on a
damaged filesystem. Catch and propagate all error conditions from
xfs_trans_reserve().

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30794a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c375214..957b8ca 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2965,7 +2965,7 @@ xlog_recover_process_data(
  * Process an extent free intent item that was recovered from
  * the log.  We need to free the extents that it describes.
  */
-STATIC void
+STATIC int
 xlog_recover_process_efi(
 	xfs_mount_t		*mp,
 	xfs_efi_log_item_t	*efip)
@@ -2973,6 +2973,7 @@ xlog_recover_process_efi(
 	xfs_efd_log_item_t	*efdp;
 	xfs_trans_t		*tp;
 	int			i;
+	int			error = 0;
 	xfs_extent_t		*extp;
 	xfs_fsblock_t		startblock_fsb;
 
@@ -2996,12 +2997,16 @@ xlog_recover_process_efi(
 			 * free the memory associated with it.
 			 */
 			xfs_efi_release(efip, efip->efi_format.efi_nextents);
-			return;
+			return XFS_ERROR(EIO);
 		}
 	}
 
 	tp = xfs_trans_alloc(mp, 0);
-	xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+	error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+		return error;
+	}
 	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
 
 	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
@@ -3013,6 +3018,7 @@ xlog_recover_process_efi(
 
 	efip->efi_flags |= XFS_EFI_RECOVERED;
 	xfs_trans_commit(tp, 0);
+	return error;
 }
 
 /*
@@ -3060,7 +3066,7 @@ xlog_recover_check_ail(
  * everything already in the AIL, we stop processing as soon as
  * we see something other than an EFI in the AIL.
  */
-STATIC void
+STATIC int
 xlog_recover_process_efis(
 	xlog_t			*log)
 {
@@ -3068,6 +3074,7 @@ xlog_recover_process_efis(
 	xfs_efi_log_item_t	*efip;
 	int			gen;
 	xfs_mount_t		*mp;
+	int			error = 0;
 
 	mp = log->l_mp;
 	spin_lock(&mp->m_ail_lock);
@@ -3092,11 +3099,14 @@ xlog_recover_process_efis(
 		}
 
 		spin_unlock(&mp->m_ail_lock);
-		xlog_recover_process_efi(mp, efip);
+		error = xlog_recover_process_efi(mp, efip);
+		if (error)
+			return error;
 		spin_lock(&mp->m_ail_lock);
 		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
 	}
 	spin_unlock(&mp->m_ail_lock);
+	return error;
 }
 
 /*
@@ -3116,9 +3126,9 @@ xlog_recover_clear_agi_bucket(
 	int		error;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
-	xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
-
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+	error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
+	if (!error)
+		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 				   XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
 				   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
 	if (error) {
@@ -3919,7 +3929,14 @@ xlog_recover_finish(
 	 * rather than accepting new requests.
 	 */
 	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
-		xlog_recover_process_efis(log);
+		int	error;
+		error = xlog_recover_process_efis(log);
+		if (error) {
+			cmn_err(CE_ALERT,
+				"Failed to recover EFIs on filesystem: %s",
+				log->l_mp->m_fsname);
+			return error;
+		}
 		/*
 		 * Sync the log to get all the EFIs out of the AIL.
 		 * This isn't absolutely necessary, but it helps in
-- 
cgit v0.10.2


From e5720eec0548c08943d759e39db0388d8fe59287 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:18 +1000
Subject: [XFS] Propagate errors from xfs_trans_commit().

xfs_trans_commit() can return errors when there are problems in the
transaction subsystem. They are indicative that the entire transaction may
be incomplete, and hence the error should be propagated as there is a good
possibility that there is something fatally wrong in the filesystem. Catch
and propagate or warn about commit errors in the places where they are
currently ignored.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30795a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 6aa3445..40ea564 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -2392,9 +2392,9 @@ xfs_qm_write_sb_changes(
 	}
 
 	xfs_mod_sb(tp, flags);
-	(void) xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp, 0);
 
-	return 0;
+	return error;
 }
 
 
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 556018d..8342823 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -734,12 +734,12 @@ xfs_qm_scall_setqlim(
 	xfs_trans_log_dquot(tp, dqp);
 
 	xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
-	xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp, 0);
 	xfs_qm_dqprint(dqp);
 	xfs_qm_dqrele(dqp);
 	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
 
-	return (0);
+	return error;
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d7514f8..63e6689 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1699,33 +1699,16 @@ xfs_itruncate_finish(
 			 * blocks in the file system, but oh well.
 			 */
 			xfs_bmap_cancel(&free_list);
-			if (committed) {
-				/*
-				 * If the passed in transaction committed
-				 * in xfs_bmap_finish(), then we want to
-				 * add the inode to this one before returning.
-				 * This keeps things simple for the higher
-				 * level code, because it always knows that
-				 * the inode is locked and held in the
-				 * transaction that returns to it whether
-				 * errors occur or not.  We don't mark the
-				 * inode dirty so that this transaction can
-				 * be easily aborted if possible.
-				 */
-				xfs_trans_ijoin(ntp, ip,
-					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-				xfs_trans_ihold(ntp, ip);
-			}
+			if (committed)
+				goto error_join;
 			return error;
 		}
 
 		if (committed) {
 			/*
-			 * The first xact was committed,
-			 * so add the inode to the new one.
-			 * Mark it dirty so it will be logged
-			 * and moved forward in the log as
-			 * part of every commit.
+			 * The first xact was committed, so add the inode to
+			 * the new one.  Mark it dirty so it will be logged and
+			 * moved forward in the log as part of every commit.
 			 */
 			xfs_trans_ijoin(ntp, ip,
 					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
@@ -1733,19 +1716,16 @@ xfs_itruncate_finish(
 			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
 		}
 		ntp = xfs_trans_dup(ntp);
-		(void) xfs_trans_commit(*tp, 0);
+		error = xfs_trans_commit(*tp, 0);
 		*tp = ntp;
+		if (error)
+			goto error_join;
 		error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
 					  XFS_TRANS_PERM_LOG_RES,
 					  XFS_ITRUNCATE_LOG_COUNT);
-		/*
-		 * Add the inode being truncated to the next chained
-		 * transaction.
-		 */
-		xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-		xfs_trans_ihold(ntp, ip);
 		if (error)
-			return (error);
+			goto error_join;
+
 	}
 	/*
 	 * Only update the size in the case of the data fork, but
@@ -1777,6 +1757,18 @@ xfs_itruncate_finish(
 	       (ip->i_d.di_nextents == 0));
 	xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
 	return 0;
+
+error_join:
+	/*
+	 * Add the inode being truncated to the next chained transaction.  This
+	 * keeps things simple for the higher level code, because it always
+	 * knows that the inode is locked and held in the transaction that
+	 * returns to it whether errors occur or not.  We don't mark the inode
+	 * dirty so that this transaction can be easily aborted if possible.
+	 */
+	xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ihold(ntp, ip);
+	return error;
 }
 
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 957b8ca..418582b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3017,7 +3017,7 @@ xlog_recover_process_efi(
 	}
 
 	efip->efi_flags |= XFS_EFI_RECOVERED;
-	xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp, 0);
 	return error;
 }
 
@@ -3131,16 +3131,13 @@ xlog_recover_clear_agi_bucket(
 		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 				   XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
 				   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
-	if (error) {
-		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-		return;
-	}
+	if (error)
+		goto out_abort;
 
+	error = EINVAL;
 	agi = XFS_BUF_TO_AGI(agibp);
-	if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) {
-		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-		return;
-	}
+	if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
+		goto out_abort;
 
 	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
 	offset = offsetof(xfs_agi_t, agi_unlinked) +
@@ -3148,7 +3145,17 @@ xlog_recover_clear_agi_bucket(
 	xfs_trans_log_buf(tp, agibp, offset,
 			  (offset + sizeof(xfs_agino_t) - 1));
 
-	(void) xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp, 0);
+	if (error)
+		goto out_error;
+	return;
+
+out_abort:
+	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+out_error:
+	xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
+			"failed to clear agi %d. Continuing.", agno);
+	return;
 }
 
 /*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 244aa1b..2d03fe1 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,7 @@
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
 
-STATIC void	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
+STATIC int	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 STATIC int	xfs_uuid_mount(xfs_mount_t *);
 STATIC void	xfs_uuid_unmount(xfs_mount_t *mp);
 STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
@@ -1189,8 +1189,13 @@ xfs_mountfs(
 	/*
 	 * If fs is not mounted readonly, then update the superblock changes.
 	 */
-	if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY))
-		xfs_mount_log_sb(mp, update_flags);
+	if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		error = xfs_mount_log_sb(mp, update_flags);
+		if (error) {
+			cmn_err(CE_WARN, "XFS: failed to write sb changes");
+			goto error4;
+		}
+	}
 
 	/*
 	 * Initialise the XFS quota management subsystem for this mount
@@ -1320,8 +1325,10 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
 		cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. "
 				"Freespace may not be correct on next mount.");
 
-
-	xfs_log_sbcount(mp, 1);
+	error = xfs_log_sbcount(mp, 1);
+	if (error)
+		cmn_err(CE_WARN, "XFS: Unable to update superblock counters. "
+				"Freespace may not be correct on next mount.");
 	xfs_unmountfs_writesb(mp);
 	xfs_unmountfs_wait(mp); 		/* wait for async bufs */
 	xfs_log_unmount(mp);			/* Done! No more fs ops. */
@@ -1413,9 +1420,8 @@ xfs_log_sbcount(
 	xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
 	if (sync)
 		xfs_trans_set_sync(tp);
-	xfs_trans_commit(tp, 0);
-
-	return 0;
+	error = xfs_trans_commit(tp, 0);
+	return error;
 }
 
 STATIC void
@@ -1913,24 +1919,27 @@ xfs_uuid_unmount(
  * be altered by the mount options, as well as any potential sb_features2
  * fixup. Only the first superblock is updated.
  */
-STATIC void
+STATIC int
 xfs_mount_log_sb(
 	xfs_mount_t	*mp,
 	__int64_t	fields)
 {
 	xfs_trans_t	*tp;
+	int		error;
 
 	ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
 			 XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2));
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
-	if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-				XFS_DEFAULT_LOG_COUNT)) {
+	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+				XFS_DEFAULT_LOG_COUNT);
+	if (error) {
 		xfs_trans_cancel(tp, 0);
-		return;
+		return error;
 	}
 	xfs_mod_sb(tp, fields);
-	xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp, 0);
+	return error;
 }
 
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 9cd6471..a0dc6e5 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -124,14 +124,14 @@ xfs_growfs_rt_alloc(
 				XFS_GROWRTALLOC_LOG_RES(mp), 0,
 				XFS_TRANS_PERM_LOG_RES,
 				XFS_DEFAULT_PERM_LOG_COUNT)))
-			goto error_exit;
+			goto error_cancel;
 		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
 		/*
 		 * Lock the inode.
 		 */
 		if ((error = xfs_trans_iget(mp, tp, ino, 0,
 						XFS_ILOCK_EXCL, &ip)))
-			goto error_exit;
+			goto error_cancel;
 		XFS_BMAP_INIT(&flist, &firstblock);
 		/*
 		 * Allocate blocks to the bitmap file.
@@ -144,14 +144,16 @@ xfs_growfs_rt_alloc(
 		if (!error && nmap < 1)
 			error = XFS_ERROR(ENOSPC);
 		if (error)
-			goto error_exit;
+			goto error_cancel;
 		/*
 		 * Free any blocks freed up in the transaction, then commit.
 		 */
 		error = xfs_bmap_finish(&tp, &flist, &committed);
 		if (error)
-			goto error_exit;
-		xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+			goto error_cancel;
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		if (error)
+			goto error;
 		/*
 		 * Now we need to clear the allocated blocks.
 		 * Do this one block per transaction, to keep it simple.
@@ -166,13 +168,13 @@ xfs_growfs_rt_alloc(
 			 */
 			if ((error = xfs_trans_reserve(tp, 0,
 					XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
-				goto error_exit;
+				goto error_cancel;
 			/*
 			 * Lock the bitmap inode.
 			 */
 			if ((error = xfs_trans_iget(mp, tp, ino, 0,
 							XFS_ILOCK_EXCL, &ip)))
-				goto error_exit;
+				goto error_cancel;
 			/*
 			 * Get a buffer for the block.
 			 */
@@ -181,14 +183,16 @@ xfs_growfs_rt_alloc(
 				mp->m_bsize, 0);
 			if (bp == NULL) {
 				error = XFS_ERROR(EIO);
-				goto error_exit;
+				goto error_cancel;
 			}
 			memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
 			xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
 			/*
 			 * Commit the transaction.
 			 */
-			xfs_trans_commit(tp, 0);
+			error = xfs_trans_commit(tp, 0);
+			if (error)
+				goto error;
 		}
 		/*
 		 * Go on to the next extent, if any.
@@ -196,8 +200,9 @@ xfs_growfs_rt_alloc(
 		oblocks = map.br_startoff + map.br_blockcount;
 	}
 	return 0;
-error_exit:
+error_cancel:
 	xfs_trans_cancel(tp, cancelflags);
+error:
 	return error;
 }
 
@@ -1876,6 +1881,7 @@ xfs_growfs_rt(
 	xfs_trans_t	*tp;		/* transaction pointer */
 
 	sbp = &mp->m_sb;
+	cancelflags = 0;
 	/*
 	 * Initial error checking.
 	 */
@@ -2042,13 +2048,15 @@ xfs_growfs_rt(
 		 */
 		mp->m_rsumlevels = nrsumlevels;
 		mp->m_rsumsize = nrsumsize;
-		/*
-		 * Commit the transaction.
-		 */
-		xfs_trans_commit(tp, 0);
+
+		error = xfs_trans_commit(tp, 0);
+		if (error) {
+			tp = NULL;
+			break;
+		}
 	}
 
-	if (error)
+	if (error && tp)
 		xfs_trans_cancel(tp, cancelflags);
 
 	/*
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 6351efb..09e186d 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -672,6 +672,8 @@ void
 xfs_attr_quiesce(
 	xfs_mount_t	*mp)
 {
+	int	error = 0;
+
 	/* wait for all modifications to complete */
 	while (atomic_read(&mp->m_active_trans) > 0)
 		delay(100);
@@ -682,7 +684,11 @@ xfs_attr_quiesce(
 	ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
 
 	/* Push the superblock and write an unmount record */
-	xfs_log_sbcount(mp, 1);
+	error = xfs_log_sbcount(mp, 1);
+	if (error)
+		xfs_fs_cmn_err(CE_WARN, mp,
+				"xfs_attr_quiesce: failed to log sb changes. "
+				"Frozen image may not be consistent.");
 	xfs_log_unmount_write(mp);
 	xfs_unmountfs_writesb(mp);
 }
@@ -1316,8 +1322,11 @@ xfs_syncsub(
 	 * of sync if we crash or get a forced shutdown. We don't want to force
 	 * this to disk, just get a transaction into the iclogs....
 	 */
-	if (flags & SYNC_SUPER)
-		xfs_log_sbcount(mp, 0);
+	if (flags & SYNC_SUPER) {
+		error = xfs_log_sbcount(mp, 0);
+		if (error)
+			last_error = error;
+	}
 
 	/*
 	 * Now check to see if the log needs a "dummy" transaction.
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index d46f24c..bc0a470 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1447,28 +1447,22 @@ xfs_inactive_attrs(
 	tp = *tpp;
 	mp = ip->i_mount;
 	ASSERT(ip->i_d.di_forkoff != 0);
-	xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	if (error)
+		goto error_unlock;
 
 	error = xfs_attr_inactive(ip);
-	if (error) {
-		*tpp = NULL;
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		return error; /* goto out */
-	}
+	if (error)
+		goto error_unlock;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 	error = xfs_trans_reserve(tp, 0,
 				  XFS_IFREE_LOG_RES(mp),
 				  0, XFS_TRANS_PERM_LOG_RES,
 				  XFS_INACTIVE_LOG_COUNT);
-	if (error) {
-		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		xfs_trans_cancel(tp, 0);
-		*tpp = NULL;
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		return error;
-	}
+	if (error)
+		goto error_cancel;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
@@ -1479,6 +1473,14 @@ xfs_inactive_attrs(
 
 	*tpp = tp;
 	return 0;
+
+error_cancel:
+	ASSERT(XFS_FORCED_SHUTDOWN(mp));
+	xfs_trans_cancel(tp, 0);
+error_unlock:
+	*tpp = NULL;
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
 }
 
 int
-- 
cgit v0.10.2


From f4586e40613a9f8bb9f7f9c8a796062a9ab1614c Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:25 +1000
Subject: [XFS] Clean up xfs_alloc_search_busy() return values.

xfs_alloc_search_busy() returns an index into the busy array if the extent
was found in the array. This is never checked, and
xfs_alloc_search_busy() does a log force to prevent reuse of the extent
before the free transaction hits the disk. Hence the return value is
useless. Declare the function void and remove the slot number from the
tracing as well.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30796a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bd5c017..bd43f77 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -45,7 +45,7 @@
 #define	XFSA_FIXUP_BNO_OK	1
 #define	XFSA_FIXUP_CNT_OK	2
 
-STATIC int
+STATIC void
 xfs_alloc_search_busy(xfs_trans_t *tp,
 		    xfs_agnumber_t agno,
 		    xfs_agblock_t bno,
@@ -64,15 +64,15 @@ ktrace_t *xfs_alloc_trace_buf;
 	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
 #define	TRACE_UNBUSY(__func__,s,ag,sl,tp)	\
 	xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
-#define	TRACE_BUSYSEARCH(__func__,s,ag,agb,l,sl,tp)	\
-	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
+#define	TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp)	\
+	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
 #else
 #define	TRACE_ALLOC(s,a)
 #define	TRACE_FREE(s,a,b,x,f)
 #define	TRACE_MODAGF(s,a,f)
 #define	TRACE_BUSY(s,a,ag,agb,l,sl,tp)
 #define	TRACE_UNBUSY(fname,s,ag,sl,tp)
-#define	TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)
+#define	TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp)
 #endif	/* XFS_ALLOC_TRACE */
 
 /*
@@ -2562,9 +2562,10 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
 
 
 /*
- * returns non-zero if any of (agno,bno):len is in a busy list
+ * If we find the extent in the busy list, force the log out to get the
+ * extent out of the busy list so the caller can use it straight away.
  */
-STATIC int
+STATIC void
 xfs_alloc_search_busy(xfs_trans_t *tp,
 		    xfs_agnumber_t agno,
 		    xfs_agblock_t bno,
@@ -2572,7 +2573,6 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 {
 	xfs_mount_t		*mp;
 	xfs_perag_busy_t	*bsy;
-	int			n;
 	xfs_agblock_t		uend, bend;
 	xfs_lsn_t		lsn;
 	int			cnt;
@@ -2585,21 +2585,18 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	uend = bno + len - 1;
 
 	/* search pagb_list for this slot, skipping open slots */
-	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
-	     cnt; bsy++, n++) {
+	for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
 
 		/*
 		 * (start1,length1) within (start2, length2)
 		 */
 		if (bsy->busy_tp != NULL) {
 			bend = bsy->busy_start + bsy->busy_length - 1;
-			if ((bno > bend) ||
-			    (uend < bsy->busy_start)) {
+			if ((bno > bend) || (uend < bsy->busy_start)) {
 				cnt--;
 			} else {
 				TRACE_BUSYSEARCH("xfs_alloc_search_busy",
-						 "found1", agno, bno, len, n,
-						 tp);
+					 "found1", agno, bno, len, tp);
 				break;
 			}
 		}
@@ -2610,15 +2607,12 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	 * transaction that freed the block
 	 */
 	if (cnt) {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp);
+		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
 		lsn = bsy->busy_tp->t_commit_lsn;
 		spin_unlock(&mp->m_perag[agno].pagb_lock);
 		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
 	} else {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp);
-		n = -1;
+		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
 		spin_unlock(&mp->m_perag[agno].pagb_lock);
 	}
-
-	return n;
 }
-- 
cgit v0.10.2


From 12375c82375ec39ec948a3ad62e5e77533515e83 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:32 +1000
Subject: [XFS] Make xfs_alloc_compute_aligned() void.

xfs_alloc_compute_aligned() returns a value based on a comparison of the
computed extent length and the minimum length allowed. This is only used
by some callers - the other four return parameters are used more often.
Hence move the comparison to the code that actually needs to do it and
make xfs_alloc_compute_aligned() a void function.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30797a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bd43f77..facdae1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -93,7 +93,7 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
  * Compute aligned version of the found extent.
  * Takes alignment and min length into account.
  */
-STATIC int				/* success (>= minlen) */
+STATIC void
 xfs_alloc_compute_aligned(
 	xfs_agblock_t	foundbno,	/* starting block in found extent */
 	xfs_extlen_t	foundlen,	/* length in found extent */
@@ -116,7 +116,6 @@ xfs_alloc_compute_aligned(
 	}
 	*resbno = bno;
 	*reslen = len;
-	return len >= minlen;
 }
 
 /*
@@ -837,9 +836,9 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (!xfs_alloc_compute_aligned(ltbno, ltlen,
-					args->alignment, args->minlen,
-					&ltbnoa, &ltlena))
+			xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
+					args->minlen, &ltbnoa, &ltlena);
+			if (ltlena < args->minlen)
 				continue;
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
@@ -958,9 +957,9 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (xfs_alloc_compute_aligned(ltbno, ltlen,
-					args->alignment, args->minlen,
-					&ltbnoa, &ltlena))
+			xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
+					args->minlen, &ltbnoa, &ltlena);
+			if (ltlena >= args->minlen)
 				break;
 			if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
 				goto error0;
@@ -974,9 +973,9 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (xfs_alloc_compute_aligned(gtbno, gtlen,
-					args->alignment, args->minlen,
-					&gtbnoa, &gtlena))
+			xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment,
+					args->minlen, &gtbnoa, &gtlena);
+			if (gtlena >= args->minlen)
 				break;
 			if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
 				goto error0;
-- 
cgit v0.10.2


From c2b1cba6833da77b1b478ac144f9cf5144d276ec Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:40 +1000
Subject: [XFS] xfs_bmap_adjacent() never returns an error.

Mark it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30798a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 65b8fa8..6d9b5448 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2402,7 +2402,7 @@ xfs_bmap_extsize_align(
 
 #define XFS_ALLOC_GAP_UNITS	4
 
-STATIC int
+STATIC void
 xfs_bmap_adjacent(
 	xfs_bmalloca_t	*ap)		/* bmap alloc argument struct */
 {
@@ -2548,7 +2548,6 @@ xfs_bmap_adjacent(
 			ap->rval = gotbno;
 	}
 #undef ISVALID
-	return 0;
 }
 
 STATIC int
-- 
cgit v0.10.2


From d87dd6360dce86cad9099aed74f14b4dd0143301 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:46 +1000
Subject: [XFS] Warn if errors come from block_truncate_page().

block_truncate_page() can return errors that we currently ignore and
silently discard. We should not ever get errors reported here - an error
indicates a bug somewhere else. Hence catch the error and issue a stack
dump to the syslog because we cannot propagate the error any further up
the call chain.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30800a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 53f8feb..41e7baa 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -692,11 +692,19 @@ xfs_vn_setattr(
 	return -error;
 }
 
+/*
+ * block_truncate_page can return an error, but we can't propagate it
+ * at all here. Leave a complaint + stack trace in the syslog because
+ * this could be bad. If it is bad, we need to propagate the error further.
+ */
 STATIC void
 xfs_vn_truncate(
 	struct inode	*inode)
 {
-	block_truncate_page(inode->i_mapping, inode->i_size, xfs_get_blocks);
+	int	error;
+	error = block_truncate_page(inode->i_mapping, inode->i_size,
+							xfs_get_blocks);
+	WARN_ON(error);
 }
 
 STATIC int
-- 
cgit v0.10.2


From fc6149d8d9634814cdcd9283b8f2efd3359181df Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:53 +1000
Subject: [XFS] Check for xfs_free_extent() failing.

xfs_free_extent() can fail, but log recovery never bothers to check if it
successfully freed the extent it was supposed to. This could lead to silent
corruption during log recovery. Abort log recovery if we fail to free an
extent.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30801a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 418582b..3a8fe7b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3003,15 +3003,15 @@ xlog_recover_process_efi(
 
 	tp = xfs_trans_alloc(mp, 0);
 	error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-		return error;
-	}
+	if (error)
+		goto abort_error;
 	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
 
 	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
 		extp = &(efip->efi_format.efi_extents[i]);
-		xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+		error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+		if (error)
+			goto abort_error;
 		xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
 					 extp->ext_len);
 	}
@@ -3019,6 +3019,10 @@ xlog_recover_process_efi(
 	efip->efi_flags |= XFS_EFI_RECOVERED;
 	error = xfs_trans_commit(tp, 0);
 	return error;
+
+abort_error:
+	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	return error;
 }
 
 /*
-- 
cgit v0.10.2


From 7c9ef85c5672ae316aafd7bbe0bbadebe90301e6 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:59 +1000
Subject: [XFS] Catch errors returned from xfs_bmap_last_offset().

xfs_bmap_last_offset() can fail and return an error.
xfs_iomap_write_allocate() fails to detect and propagate the error.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30802a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index fde37f8..fb3cf11 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -802,8 +802,11 @@ xfs_iomap_write_allocate(
 			 */
 			nimaps = 1;
 			end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
-			xfs_bmap_last_offset(NULL, ip, &last_block,
-				XFS_DATA_FORK);
+			error = xfs_bmap_last_offset(NULL, ip, &last_block,
+							XFS_DATA_FORK);
+			if (error)
+				goto trans_cancel;
+
 			last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
 			if ((map_start_fsb + count_fsb) > last_block) {
 				count_fsb = last_block - map_start_fsb;
-- 
cgit v0.10.2


From 556b8b166c9514b5f940047a41dad8fe8cd9a778 Mon Sep 17 00:00:00 2001
From: Barry Naujok <bnaujok@sgi.com>
Date: Thu, 10 Apr 2008 12:22:07 +1000
Subject: [XFS] remove bhv_vname_t and xfs_rename code

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30804a

Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 66a9a9e..265f016 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -22,6 +22,7 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
+#include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_export.h"
@@ -30,8 +31,6 @@
 #include "xfs_inode.h"
 #include "xfs_vfsops.h"
 
-static struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, };
-
 /*
  * Note that we only accept fileids which are long enough rather than allow
  * the parent generation number to default to zero.  XFS considers zero a
@@ -216,7 +215,7 @@ xfs_fs_get_parent(
 	struct xfs_inode	*cip;
 	struct dentry		*parent;
 
-	error = xfs_lookup(XFS_I(child->d_inode), &dotdot, &cip);
+	error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip);
 	if (unlikely(error))
 		return ERR_PTR(-error);
 
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 41e7baa..0c958cf 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -239,6 +239,15 @@ xfs_init_security(
 	return error;
 }
 
+static void
+xfs_dentry_to_name(
+	struct xfs_name	*namep,
+	struct dentry	*dentry)
+{
+	namep->name = dentry->d_name.name;
+	namep->len = dentry->d_name.len;
+}
+
 STATIC void
 xfs_cleanup_inode(
 	struct inode	*dir,
@@ -246,20 +255,19 @@ xfs_cleanup_inode(
 	struct dentry	*dentry,
 	int		mode)
 {
-	struct dentry   teardown = {};
+	struct xfs_name	teardown;
 
 	/* Oh, the horror.
 	 * If we can't add the ACL or we fail in
 	 * xfs_init_security we must back out.
 	 * ENOSPC can hit here, among other things.
 	 */
-	teardown.d_inode = inode;
-	teardown.d_name = dentry->d_name;
+	xfs_dentry_to_name(&teardown, dentry);
 
 	if (S_ISDIR(mode))
-		xfs_rmdir(XFS_I(dir), &teardown);
+		xfs_rmdir(XFS_I(dir), &teardown, XFS_I(inode));
 	else
-		xfs_remove(XFS_I(dir), &teardown);
+		xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
 	iput(inode);
 }
 
@@ -273,6 +281,7 @@ xfs_vn_mknod(
 	struct inode	*inode;
 	struct xfs_inode *ip = NULL;
 	xfs_acl_t	*default_acl = NULL;
+	struct xfs_name	name;
 	attrexists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
 	int		error;
 
@@ -293,6 +302,8 @@ xfs_vn_mknod(
 		}
 	}
 
+	xfs_dentry_to_name(&name, dentry);
+
 	if (IS_POSIXACL(dir) && !default_acl)
 		mode &= ~current->fs->umask;
 
@@ -303,10 +314,10 @@ xfs_vn_mknod(
 	case S_IFSOCK:
 		rdev = sysv_encode_dev(rdev);
 	case S_IFREG:
-		error = xfs_create(XFS_I(dir), dentry, mode, rdev, &ip, NULL);
+		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
 		break;
 	case S_IFDIR:
-		error = xfs_mkdir(XFS_I(dir), dentry, mode, &ip, NULL);
+		error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
 		break;
 	default:
 		error = EINVAL;
@@ -371,12 +382,14 @@ xfs_vn_lookup(
 	struct nameidata *nd)
 {
 	struct xfs_inode *cip;
+	struct xfs_name	name;
 	int		error;
 
 	if (dentry->d_name.len >= MAXNAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	error = xfs_lookup(XFS_I(dir), dentry, &cip);
+	xfs_dentry_to_name(&name, dentry);
+	error = xfs_lookup(XFS_I(dir), &name, &cip);
 	if (unlikely(error)) {
 		if (unlikely(error != ENOENT))
 			return ERR_PTR(-error);
@@ -394,12 +407,14 @@ xfs_vn_link(
 	struct dentry	*dentry)
 {
 	struct inode	*inode;	/* inode of guy being linked to */
+	struct xfs_name	name;
 	int		error;
 
 	inode = old_dentry->d_inode;
+	xfs_dentry_to_name(&name, dentry);
 
 	igrab(inode);
-	error = xfs_link(XFS_I(dir), XFS_I(inode), dentry);
+	error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
 	if (unlikely(error)) {
 		iput(inode);
 		return -error;
@@ -417,11 +432,13 @@ xfs_vn_unlink(
 	struct dentry	*dentry)
 {
 	struct inode	*inode;
+	struct xfs_name	name;
 	int		error;
 
 	inode = dentry->d_inode;
+	xfs_dentry_to_name(&name, dentry);
 
-	error = xfs_remove(XFS_I(dir), dentry);
+	error = xfs_remove(XFS_I(dir), &name, XFS_I(inode));
 	if (likely(!error)) {
 		xfs_validate_fields(dir);	/* size needs update */
 		xfs_validate_fields(inode);
@@ -437,14 +454,15 @@ xfs_vn_symlink(
 {
 	struct inode	*inode;
 	struct xfs_inode *cip = NULL;
+	struct xfs_name	name;
 	int		error;
 	mode_t		mode;
 
 	mode = S_IFLNK |
 		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
+	xfs_dentry_to_name(&name, dentry);
 
-	error = xfs_symlink(XFS_I(dir), dentry, (char *)symname, mode,
-			    &cip, NULL);
+	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
 	if (unlikely(error))
 		goto out;
 
@@ -471,9 +489,12 @@ xfs_vn_rmdir(
 	struct dentry	*dentry)
 {
 	struct inode	*inode = dentry->d_inode;
+	struct xfs_name	name;
 	int		error;
 
-	error = xfs_rmdir(XFS_I(dir), dentry);
+	xfs_dentry_to_name(&name, dentry);
+
+	error = xfs_rmdir(XFS_I(dir), &name, XFS_I(inode));
 	if (likely(!error)) {
 		xfs_validate_fields(inode);
 		xfs_validate_fields(dir);
@@ -489,9 +510,15 @@ xfs_vn_rename(
 	struct dentry	*ndentry)
 {
 	struct inode	*new_inode = ndentry->d_inode;
+	struct xfs_name	oname;
+	struct xfs_name	nname;
 	int		error;
 
-	error = xfs_rename(XFS_I(odir), odentry, XFS_I(ndir), ndentry);
+	xfs_dentry_to_name(&oname, odentry);
+	xfs_dentry_to_name(&nname, ndentry);
+
+	error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+							XFS_I(ndir), &nname);
 	if (likely(!error)) {
 		if (new_inode)
 			xfs_validate_fields(new_inode);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index dbb8a5d..8b4d63c 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -23,8 +23,6 @@ struct bhv_vattr;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
 
-typedef struct dentry	bhv_vname_t;
-typedef __u64		bhv_vnumber_t;
 typedef struct inode	bhv_vnode_t;
 
 #define VN_ISLNK(vp)	S_ISLNK((vp)->i_mode)
@@ -211,13 +209,6 @@ static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp)
 }
 
 /*
- * Vname handling macros.
- */
-#define VNAME(dentry)		((char *) (dentry)->d_name.name)
-#define VNAMELEN(dentry)	((dentry)->d_name.len)
-#define VNAME_TO_INODE(dentry)	(XFS_I((dentry)->d_inode))
-
-/*
  * Dealing with bad inodes
  */
 static inline int VN_BAD(bhv_vnode_t *vp)
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index e92e73f..7cb2652 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -44,6 +44,7 @@
 #include "xfs_error.h"
 #include "xfs_vnodeops.h"
 
+struct xfs_name xfs_name_dotdot = {"..", 2};
 
 void
 xfs_dir_mount(
@@ -146,8 +147,7 @@ int
 xfs_dir_createname(
 	xfs_trans_t		*tp,
 	xfs_inode_t		*dp,
-	char			*name,
-	int			namelen,
+	struct xfs_name		*name,
 	xfs_ino_t		inum,		/* new entry inode number */
 	xfs_fsblock_t		*first,		/* bmap's firstblock */
 	xfs_bmap_free_t		*flist,		/* bmap's freeblock list */
@@ -162,9 +162,9 @@ xfs_dir_createname(
 		return rval;
 	XFS_STATS_INC(xs_dir_create);
 
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
+	args.name = name->name;
+	args.namelen = name->len;
+	args.hashval = xfs_da_hashname(name->name, name->len);
 	args.inumber = inum;
 	args.dp = dp;
 	args.firstblock = first;
@@ -197,8 +197,7 @@ int
 xfs_dir_lookup(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*dp,
-	char		*name,
-	int		namelen,
+	struct xfs_name	*name,
 	xfs_ino_t	*inum)		/* out: inode number */
 {
 	xfs_da_args_t	args;
@@ -207,18 +206,14 @@ xfs_dir_lookup(
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	XFS_STATS_INC(xs_dir_lookup);
+	memset(&args, 0, sizeof(xfs_da_args_t));
 
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = 0;
+	args.name = name->name;
+	args.namelen = name->len;
+	args.hashval = xfs_da_hashname(name->name, name->len);
 	args.dp = dp;
-	args.firstblock = NULL;
-	args.flist = NULL;
-	args.total = 0;
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
-	args.justcheck = args.addname = 0;
 	args.oknoent = 1;
 
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
@@ -247,8 +242,7 @@ int
 xfs_dir_removename(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*dp,
-	char		*name,
-	int		namelen,
+	struct xfs_name	*name,
 	xfs_ino_t	ino,
 	xfs_fsblock_t	*first,		/* bmap's firstblock */
 	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
@@ -261,9 +255,9 @@ xfs_dir_removename(
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	XFS_STATS_INC(xs_dir_remove);
 
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
+	args.name = name->name;
+	args.namelen = name->len;
+	args.hashval = xfs_da_hashname(name->name, name->len);
 	args.inumber = ino;
 	args.dp = dp;
 	args.firstblock = first;
@@ -329,8 +323,7 @@ int
 xfs_dir_replace(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*dp,
-	char		*name,		/* name of entry to replace */
-	int		namelen,
+	struct xfs_name	*name,		/* name of entry to replace */
 	xfs_ino_t	inum,		/* new inode number */
 	xfs_fsblock_t	*first,		/* bmap's firstblock */
 	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
@@ -345,9 +338,9 @@ xfs_dir_replace(
 	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
 		return rval;
 
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
+	args.name = name->name;
+	args.namelen = name->len;
+	args.hashval = xfs_da_hashname(name->name, name->len);
 	args.inumber = inum;
 	args.dp = dp;
 	args.firstblock = first;
@@ -374,28 +367,29 @@ xfs_dir_replace(
 
 /*
  * See if this entry can be added to the directory without allocating space.
+ * Skips the check and returns 0 if the caller reserved space (resblks != 0).
  */
 int
 xfs_dir_canenter(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*dp,
-	char		*name,		/* name of entry to add */
-	int		namelen)
+	struct xfs_name	*name,		/* name of entry to add */
+	uint		resblks)
 {
 	xfs_da_args_t	args;
 	int		rval;
 	int		v;		/* type-checking value */
 
+	if (resblks)
+		return 0;
+
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+	memset(&args, 0, sizeof(xfs_da_args_t));
 
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = 0;
+	args.name = name->name;
+	args.namelen = name->len;
+	args.hashval = xfs_da_hashname(name->name, name->len);
 	args.dp = dp;
-	args.firstblock = NULL;
-	args.flist = NULL;
-	args.total = 0;
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
 	args.justcheck = args.addname = args.oknoent = 1;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index b265197..6392f93 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -59,6 +59,8 @@ typedef	__uint32_t	xfs_dir2_db_t;
  */
 typedef	xfs_off_t	xfs_dir2_off_t;
 
+extern struct xfs_name	xfs_name_dotdot;
+
 /*
  * Generic directory interface routines
  */
@@ -68,21 +70,21 @@ extern int xfs_dir_isempty(struct xfs_inode *dp);
 extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
 				struct xfs_inode *pdp);
 extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
-				char *name, int namelen, xfs_ino_t inum,
+				struct xfs_name *name, xfs_ino_t inum,
 				xfs_fsblock_t *first,
 				struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
-				char *name, int namelen, xfs_ino_t *inum);
+				struct xfs_name *name, xfs_ino_t *inum);
 extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
-				char *name, int namelen, xfs_ino_t ino,
+				struct xfs_name *name, xfs_ino_t ino,
 				xfs_fsblock_t *first,
 				struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
-				char *name, int namelen, xfs_ino_t inum,
+				struct xfs_name *name, xfs_ino_t inum,
 				xfs_fsblock_t *first,
 				struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
-				char *name, int namelen);
+				struct xfs_name *name, uint resblks);
 extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
 
 /*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 77b39f6..1ed5751 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -73,7 +73,7 @@ typedef int	(*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
 			struct xfs_inode *, dm_right_t,
 			struct xfs_inode *, dm_right_t,
-			char *, char *, mode_t, int, int);
+			const char *, const char *, mode_t, int, int);
 typedef int	(*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
 			char *, char *);
 typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
@@ -401,7 +401,7 @@ typedef struct xfs_mount {
 
 /*
  * Allow large block sizes to be reported to userspace programs if the
- * "largeio" mount option is used. 
+ * "largeio" mount option is used.
  *
  * If compatibility mode is specified, simply return the basic unit of caching
  * so that we don't get inefficient read/modify/write I/O from user apps.
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index c4d0bac..ee37189 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -83,26 +83,23 @@ int xfs_rename_skip, xfs_rename_nskip;
  */
 STATIC int
 xfs_lock_for_rename(
-	xfs_inode_t	*dp1,	/* old (source) directory inode */
-	xfs_inode_t	*dp2,	/* new (target) directory inode */
-	bhv_vname_t	*vname1,/* old entry name */
-	bhv_vname_t	*vname2,/* new entry name */
-	xfs_inode_t	**ipp1,	/* inode of old entry */
-	xfs_inode_t	**ipp2,	/* inode of new entry, if it
+	xfs_inode_t	*dp1,	/* in: old (source) directory inode */
+	xfs_inode_t	*dp2,	/* in: new (target) directory inode */
+	xfs_inode_t	*ip1,	/* in: inode of old entry */
+	struct xfs_name	*name2,	/* in: new entry name */
+	xfs_inode_t	**ipp2,	/* out: inode of new entry, if it
 				   already exists, NULL otherwise. */
-	xfs_inode_t	**i_tab,/* array of inode returned, sorted */
-	int		*num_inodes)  /* number of inodes in array */
+	xfs_inode_t	**i_tab,/* out: array of inode returned, sorted */
+	int		*num_inodes)  /* out: number of inodes in array */
 {
-	xfs_inode_t		*ip1 = VNAME_TO_INODE(vname1);
-	xfs_inode_t		*ip2, *temp;
+	xfs_inode_t		*ip2 = NULL;
+	xfs_inode_t		*temp;
 	xfs_ino_t		inum1, inum2;
 	int			error;
 	int			i, j;
 	uint			lock_mode;
 	int			diff_dirs = (dp1 != dp2);
 
-	ip2 = NULL;
-
 	/*
 	 * First, find out the current inums of the entries so that we
 	 * can determine the initial locking order.  We'll have to
@@ -115,17 +112,15 @@ xfs_lock_for_rename(
 
 	inum1 = ip1->i_ino;
 
-
 	/*
 	 * Unlock dp1 and lock dp2 if they are different.
 	 */
-
 	if (diff_dirs) {
 		xfs_iunlock_map_shared(dp1, lock_mode);
 		lock_mode = xfs_ilock_map_shared(dp2);
 	}
 
-	error = xfs_dir_lookup_int(dp2, lock_mode, vname2, &inum2, &ip2);
+	error = xfs_dir_lookup_int(dp2, lock_mode, name2, &inum2, &ip2);
 	if (error == ENOENT) {		/* target does not need to exist. */
 		inum2 = 0;
 	} else if (error) {
@@ -157,6 +152,7 @@ xfs_lock_for_rename(
 		*num_inodes = 4;
 		i_tab[3] = ip2;
 	}
+	*ipp2 = i_tab[3];
 
 	/*
 	 * Sort the elements via bubble sort.  (Remember, there are at
@@ -194,21 +190,6 @@ xfs_lock_for_rename(
 		xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED);
 	}
 
-	/*
-	 * Set the return value. Null out any unused entries in i_tab.
-	 */
-	*ipp1 = *ipp2 = NULL;
-	for (i=0; i < *num_inodes; i++) {
-		if (i_tab[i]->i_ino == inum1) {
-			*ipp1 = i_tab[i];
-		}
-		if (i_tab[i]->i_ino == inum2) {
-			*ipp2 = i_tab[i];
-		}
-	}
-	for (;i < 4; i++) {
-		i_tab[i] = NULL;
-	}
 	return 0;
 }
 
@@ -218,12 +199,13 @@ xfs_lock_for_rename(
 int
 xfs_rename(
 	xfs_inode_t	*src_dp,
-	bhv_vname_t	*src_vname,
+	struct xfs_name	*src_name,
+	xfs_inode_t	*src_ip,
 	xfs_inode_t	*target_dp,
-	bhv_vname_t	*target_vname)
+	struct xfs_name	*target_name)
 {
 	xfs_trans_t	*tp;
-	xfs_inode_t	*src_ip, *target_ip;
+	xfs_inode_t	*target_ip;
 	xfs_mount_t	*mp = src_dp->i_mount;
 	int		new_parent;		/* moving to a new dir */
 	int		src_is_directory;	/* src_name is a directory */
@@ -237,10 +219,6 @@ xfs_rename(
 	int		spaceres;
 	int		target_link_zero = 0;
 	int		num_inodes;
-	char		*src_name = VNAME(src_vname);
-	char		*target_name = VNAME(target_vname);
-	int		src_namelen = VNAMELEN(src_vname);
-	int		target_namelen = VNAMELEN(target_vname);
 
 	xfs_itrace_entry(src_dp);
 	xfs_itrace_entry(target_dp);
@@ -250,7 +228,7 @@ xfs_rename(
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
 					src_dp, DM_RIGHT_NULL,
 					target_dp, DM_RIGHT_NULL,
-					src_name, target_name,
+					src_name->name, target_name->name,
 					0, 0, 0);
 		if (error) {
 			return error;
@@ -267,10 +245,8 @@ xfs_rename(
 	 * does not exist in the source directory.
 	 */
 	tp = NULL;
-	error = xfs_lock_for_rename(src_dp, target_dp, src_vname,
-			target_vname, &src_ip, &target_ip, inodes,
-			&num_inodes);
-
+	error = xfs_lock_for_rename(src_dp, target_dp, src_ip, target_name,
+					&target_ip, inodes, &num_inodes);
 	if (error) {
 		/*
 		 * We have nothing locked, no inode references, and
@@ -316,7 +292,7 @@ xfs_rename(
 	XFS_BMAP_INIT(&free_list, &first_block);
 	tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen);
+	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
 	error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
 			XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
 	if (error == ENOSPC) {
@@ -374,9 +350,8 @@ xfs_rename(
 		 * If there's no space reservation, check the entry will
 		 * fit before actually inserting it.
 		 */
-		if (spaceres == 0 &&
-		    (error = xfs_dir_canenter(tp, target_dp, target_name,
-						target_namelen)))
+		error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
+		if (error)
 			goto error_return;
 		/*
 		 * If target does not exist and the rename crosses
@@ -384,8 +359,8 @@ xfs_rename(
 		 * to account for the ".." reference from the new entry.
 		 */
 		error = xfs_dir_createname(tp, target_dp, target_name,
-					   target_namelen, src_ip->i_ino,
-					   &first_block, &free_list, spaceres);
+						src_ip->i_ino, &first_block,
+						&free_list, spaceres);
 		if (error == ENOSPC)
 			goto error_return;
 		if (error)
@@ -424,7 +399,7 @@ xfs_rename(
 		 * name at the destination directory, remove it first.
 		 */
 		error = xfs_dir_replace(tp, target_dp, target_name,
-					target_namelen, src_ip->i_ino,
+					src_ip->i_ino,
 					&first_block, &free_list, spaceres);
 		if (error)
 			goto abort_return;
@@ -461,7 +436,8 @@ xfs_rename(
 		 * Rewrite the ".." entry to point to the new
 		 * directory.
 		 */
-		error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino,
+		error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+					target_dp->i_ino,
 					&first_block, &free_list, spaceres);
 		ASSERT(error != EEXIST);
 		if (error)
@@ -497,8 +473,8 @@ xfs_rename(
 			goto abort_return;
 	}
 
-	error = xfs_dir_removename(tp, src_dp, src_name, src_namelen,
-			src_ip->i_ino, &first_block, &free_list, spaceres);
+	error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+					&first_block, &free_list, spaceres);
 	if (error)
 		goto abort_return;
 	xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -583,7 +559,7 @@ std_return:
 		(void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
 					src_dp, DM_RIGHT_NULL,
 					target_dp, DM_RIGHT_NULL,
-					src_name, target_name,
+					src_name->name, target_name->name,
 					0, error, 0);
 	}
 	return error;
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 5c89be4..0f51916 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -160,4 +160,9 @@ typedef enum {
 	XFS_BTNUM_MAX
 } xfs_btnum_t;
 
+struct xfs_name {
+	const char	*name;
+	int		len;
+};
+
 #endif	/* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 47c45ff..2b8dc7e 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -45,7 +45,7 @@ int
 xfs_dir_lookup_int(
 	xfs_inode_t	*dp,
 	uint		lock_mode,
-	bhv_vname_t	*dentry,
+	struct xfs_name	*name,
 	xfs_ino_t	*inum,
 	xfs_inode_t	**ipp)
 {
@@ -53,7 +53,7 @@ xfs_dir_lookup_int(
 
 	xfs_itrace_entry(dp);
 
-	error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum);
+	error = xfs_dir_lookup(NULL, dp, name, inum);
 	if (!error) {
 		/*
 		 * Unlock the directory. We do this because we can't
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index 701accb..175b126 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -21,8 +21,8 @@
 #define IRELE(ip)	VN_RELE(XFS_ITOV(ip))
 #define IHOLD(ip)	VN_HOLD(XFS_ITOV(ip))
 
-extern int xfs_dir_lookup_int(xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *,
-				xfs_inode_t **);
+extern int xfs_dir_lookup_int(xfs_inode_t *, uint, struct xfs_name *,
+				xfs_ino_t *, xfs_inode_t **);
 extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
 extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
 				xfs_dev_t, cred_t *, prid_t, int,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index bc0a470..ca38fb9 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1764,7 +1764,7 @@ xfs_inactive(
 int
 xfs_lookup(
 	xfs_inode_t		*dp,
-	bhv_vname_t		*dentry,
+	struct xfs_name		*name,
 	xfs_inode_t		**ipp)
 {
 	xfs_inode_t		*ip;
@@ -1778,7 +1778,7 @@ xfs_lookup(
 		return XFS_ERROR(EIO);
 
 	lock_mode = xfs_ilock_map_shared(dp);
-	error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip);
+	error = xfs_dir_lookup_int(dp, lock_mode, name, &e_inum, &ip);
 	if (!error) {
 		*ipp = ip;
 		xfs_itrace_ref(ip);
@@ -1790,17 +1790,16 @@ xfs_lookup(
 int
 xfs_create(
 	xfs_inode_t		*dp,
-	bhv_vname_t		*dentry,
+	struct xfs_name		*name,
 	mode_t			mode,
 	xfs_dev_t		rdev,
 	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
-	char			*name = VNAME(dentry);
-	xfs_mount_t	        *mp = dp->i_mount;
+	xfs_mount_t		*mp = dp->i_mount;
 	xfs_inode_t		*ip;
 	xfs_trans_t		*tp;
-	int                     error;
+	int			error;
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
 	boolean_t		unlock_dp_on_error = B_FALSE;
@@ -1810,17 +1809,14 @@ xfs_create(
 	xfs_prid_t		prid;
 	struct xfs_dquot	*udqp, *gdqp;
 	uint			resblks;
-	int			namelen;
 
 	ASSERT(!*ipp);
 	xfs_itrace_entry(dp);
 
-	namelen = VNAMELEN(dentry);
-
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
 				dp, DM_RIGHT_NULL, NULL,
-				DM_RIGHT_NULL, name, NULL,
+				DM_RIGHT_NULL, name->name, NULL,
 				mode, 0, 0);
 
 		if (error)
@@ -1852,7 +1848,7 @@ xfs_create(
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_CREATE_SPACE_RES(mp, namelen);
+	resblks = XFS_CREATE_SPACE_RES(mp, name->len);
 	/*
 	 * Initially assume that the file does not exist and
 	 * reserve the resources for that case.  If that is not
@@ -1885,7 +1881,8 @@ xfs_create(
 	if (error)
 		goto error_return;
 
-	if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
+	error = xfs_dir_canenter(tp, dp, name, resblks);
+	if (error)
 		goto error_return;
 	error = xfs_dir_ialloc(&tp, dp, mode, 1,
 			rdev, credp, prid, resblks > 0,
@@ -1915,7 +1912,7 @@ xfs_create(
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
-	error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
+	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
 					&first_block, &free_list, resblks ?
 					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
@@ -1976,7 +1973,7 @@ std_return:
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
 			dp, DM_RIGHT_NULL,
 			*ipp ? ip : NULL,
-			DM_RIGHT_NULL, name, NULL,
+			DM_RIGHT_NULL, name->name, NULL,
 			mode, error, 0);
 	}
 	return error;
@@ -2268,12 +2265,10 @@ int remove_which_error_return = 0;
 int
 xfs_remove(
 	xfs_inode_t             *dp,
-	bhv_vname_t		*dentry)
+	struct xfs_name		*name,
+	xfs_inode_t		*ip)
 {
-	char			*name = VNAME(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
-	xfs_inode_t             *ip = VNAME_TO_INODE(dentry);
-	int			namelen = VNAMELEN(dentry);
 	xfs_trans_t             *tp = NULL;
 	int                     error = 0;
 	xfs_bmap_free_t         free_list;
@@ -2289,9 +2284,9 @@ xfs_remove(
 		return XFS_ERROR(EIO);
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp,
-					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					name, NULL, ip->i_d.di_mode, 0, 0);
+		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
+					NULL, DM_RIGHT_NULL, name->name, NULL,
+					ip->i_d.di_mode, 0, 0);
 		if (error)
 			return error;
 	}
@@ -2376,7 +2371,7 @@ xfs_remove(
 	 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
 	 */
 	XFS_BMAP_INIT(&free_list, &first_block);
-	error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
+	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
 					&first_block, &free_list, 0);
 	if (error) {
 		ASSERT(error != ENOENT);
@@ -2444,7 +2439,7 @@ xfs_remove(
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
 				dp, DM_RIGHT_NULL,
 				NULL, DM_RIGHT_NULL,
-				name, NULL, ip->i_d.di_mode, error, 0);
+				name->name, NULL, ip->i_d.di_mode, error, 0);
 	}
 	return error;
 
@@ -2474,7 +2469,7 @@ int
 xfs_link(
 	xfs_inode_t		*tdp,
 	xfs_inode_t		*sip,
-	bhv_vname_t		*dentry)
+	struct xfs_name		*target_name)
 {
 	xfs_mount_t		*mp = tdp->i_mount;
 	xfs_trans_t		*tp;
@@ -2485,13 +2480,10 @@ xfs_link(
 	int			cancel_flags;
 	int			committed;
 	int			resblks;
-	char			*target_name = VNAME(dentry);
-	int			target_namelen;
 
 	xfs_itrace_entry(tdp);
 	xfs_itrace_entry(sip);
 
-	target_namelen = VNAMELEN(dentry);
 	ASSERT(!S_ISDIR(sip->i_d.di_mode));
 
 	if (XFS_FORCED_SHUTDOWN(mp))
@@ -2501,7 +2493,7 @@ xfs_link(
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
 					tdp, DM_RIGHT_NULL,
 					sip, DM_RIGHT_NULL,
-					target_name, NULL, 0, 0, 0);
+					target_name->name, NULL, 0, 0, 0);
 		if (error)
 			return error;
 	}
@@ -2516,7 +2508,7 @@ xfs_link(
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
+	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
 	error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
 			XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
 	if (error == ENOSPC) {
@@ -2568,15 +2560,14 @@ xfs_link(
 		goto error_return;
 	}
 
-	if (resblks == 0 &&
-	    (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
+	error = xfs_dir_canenter(tp, tdp, target_name, resblks);
+	if (error)
 		goto error_return;
 
 	XFS_BMAP_INIT(&free_list, &first_block);
 
-	error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
-				   sip->i_ino, &first_block, &free_list,
-				   resblks);
+	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
+					&first_block, &free_list, resblks);
 	if (error)
 		goto abort_return;
 	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2612,7 +2603,7 @@ std_return:
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
 				tdp, DM_RIGHT_NULL,
 				sip, DM_RIGHT_NULL,
-				target_name, NULL, 0, error, 0);
+				target_name->name, NULL, 0, error, 0);
 	}
 	return error;
 
@@ -2629,13 +2620,11 @@ std_return:
 int
 xfs_mkdir(
 	xfs_inode_t             *dp,
-	bhv_vname_t		*dentry,
+	struct xfs_name		*dir_name,
 	mode_t			mode,
 	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
-	char			*dir_name = VNAME(dentry);
-	int			dir_namelen = VNAMELEN(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_inode_t		*cdp;	/* inode of created dir */
 	xfs_trans_t		*tp;
@@ -2659,7 +2648,7 @@ xfs_mkdir(
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
 					dp, DM_RIGHT_NULL, NULL,
-					DM_RIGHT_NULL, dir_name, NULL,
+					DM_RIGHT_NULL, dir_name->name, NULL,
 					mode, 0, 0);
 		if (error)
 			return error;
@@ -2688,7 +2677,7 @@ xfs_mkdir(
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
+	resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
 	error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
 				  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
 	if (error == ENOSPC) {
@@ -2720,8 +2709,8 @@ xfs_mkdir(
 	if (error)
 		goto error_return;
 
-	if (resblks == 0 &&
-	    (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
+	error = xfs_dir_canenter(tp, dp, dir_name, resblks);
+	if (error)
 		goto error_return;
 	/*
 	 * create the directory inode.
@@ -2750,9 +2739,9 @@ xfs_mkdir(
 
 	XFS_BMAP_INIT(&free_list, &first_block);
 
-	error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
-				   &first_block, &free_list, resblks ?
-				   resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+	error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
+					&first_block, &free_list, resblks ?
+					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
 		ASSERT(error != ENOSPC);
 		goto error1;
@@ -2817,7 +2806,7 @@ std_return:
 					dp, DM_RIGHT_NULL,
 					created ? cdp : NULL,
 					DM_RIGHT_NULL,
-					dir_name, NULL,
+					dir_name->name, NULL,
 					mode, error, 0);
 	}
 	return error;
@@ -2841,13 +2830,11 @@ std_return:
 int
 xfs_rmdir(
 	xfs_inode_t             *dp,
-	bhv_vname_t		*dentry)
+	struct xfs_name		*name,
+	xfs_inode_t		*cdp)
 {
 	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
-	char			*name = VNAME(dentry);
-	int			namelen = VNAMELEN(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
-  	xfs_inode_t             *cdp = VNAME_TO_INODE(dentry);
 	xfs_trans_t             *tp;
 	int                     error;
 	xfs_bmap_free_t         free_list;
@@ -2865,8 +2852,8 @@ xfs_rmdir(
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
 					dp, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL,
-					name, NULL, cdp->i_d.di_mode, 0, 0);
+					NULL, DM_RIGHT_NULL, name->name,
+					NULL, cdp->i_d.di_mode, 0, 0);
 		if (error)
 			return XFS_ERROR(error);
 	}
@@ -2960,7 +2947,7 @@ xfs_rmdir(
 		goto error_return;
 	}
 
-	error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
+	error = xfs_dir_removename(tp, dp, name, cdp->i_ino,
 					&first_block, &free_list, resblks);
 	if (error)
 		goto error1;
@@ -3040,7 +3027,7 @@ xfs_rmdir(
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
 					dp, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL,
-					name, NULL, cdp->i_d.di_mode,
+					name->name, NULL, cdp->i_d.di_mode,
 					error, 0);
 	}
 	return error;
@@ -3058,8 +3045,8 @@ xfs_rmdir(
 int
 xfs_symlink(
 	xfs_inode_t		*dp,
-	bhv_vname_t		*dentry,
-	char			*target_path,
+	struct xfs_name		*link_name,
+	const char		*target_path,
 	mode_t			mode,
 	xfs_inode_t		**ipp,
 	cred_t			*credp)
@@ -3079,15 +3066,13 @@ xfs_symlink(
 	int			nmaps;
 	xfs_bmbt_irec_t		mval[SYMLINK_MAPS];
 	xfs_daddr_t		d;
-	char			*cur_chunk;
+	const char		*cur_chunk;
 	int			byte_cnt;
 	int			n;
 	xfs_buf_t		*bp;
 	xfs_prid_t		prid;
 	struct xfs_dquot	*udqp, *gdqp;
 	uint			resblks;
-	char			*link_name = VNAME(dentry);
-	int			link_namelen;
 
 	*ipp = NULL;
 	error = 0;
@@ -3099,8 +3084,6 @@ xfs_symlink(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	link_namelen = VNAMELEN(dentry);
-
 	/*
 	 * Check component lengths of the target path name.
 	 */
@@ -3111,7 +3094,7 @@ xfs_symlink(
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					link_name, target_path, 0, 0, 0);
+					link_name->name, target_path, 0, 0, 0);
 		if (error)
 			return error;
 	}
@@ -3143,7 +3126,7 @@ xfs_symlink(
 		fs_blocks = 0;
 	else
 		fs_blocks = XFS_B_TO_FSB(mp, pathlen);
-	resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
+	resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
 	error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
 			XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
 	if (error == ENOSPC && fs_blocks == 0) {
@@ -3177,8 +3160,8 @@ xfs_symlink(
 	/*
 	 * Check for ability to enter directory entry, if no space reserved.
 	 */
-	if (resblks == 0 &&
-	    (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
+	error = xfs_dir_canenter(tp, dp, link_name, resblks);
+	if (error)
 		goto error_return;
 	/*
 	 * Initialize the bmap freelist prior to calling either
@@ -3270,8 +3253,8 @@ xfs_symlink(
 	/*
 	 * Create the directory entry for the symlink.
 	 */
-	error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
-				   &first_block, &free_list, resblks);
+	error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
+					&first_block, &free_list, resblks);
 	if (error)
 		goto error1;
 	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3315,8 +3298,8 @@ std_return:
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
 					dp, DM_RIGHT_NULL,
 					error ? NULL : ip,
-					DM_RIGHT_NULL, link_name, target_path,
-					0, error, 0);
+					DM_RIGHT_NULL, link_name->name,
+					target_path, 0, error, 0);
 	}
 
 	if (!error)
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 12e5818..24c5392 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -23,20 +23,22 @@ int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
 		xfs_off_t stop);
 int xfs_release(struct xfs_inode *ip);
 int xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry,
+int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode **ipp);
-int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode,
+int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
 		xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
-int xfs_remove(struct xfs_inode *dp, bhv_vname_t	*dentry);
+int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
+		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
-		bhv_vname_t *dentry);
-int xfs_mkdir(struct xfs_inode *dp, bhv_vname_t *dentry,
+		struct xfs_name *target_name);
+int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
 		mode_t mode, struct xfs_inode **ipp, struct cred *credp);
-int xfs_rmdir(struct xfs_inode *dp, bhv_vname_t *dentry);
+int xfs_rmdir(struct xfs_inode *dp, struct xfs_name *name,
+		struct xfs_inode *cdp);
 int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
-int xfs_symlink(struct xfs_inode *dp, bhv_vname_t *dentry,
-		char *target_path, mode_t mode, struct xfs_inode **ipp,
+int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
+		const char *target_path, mode_t mode, struct xfs_inode **ipp,
 		struct cred *credp);
 int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
@@ -44,8 +46,9 @@ int xfs_reclaim(struct xfs_inode *ip);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 		xfs_flock64_t *bf, xfs_off_t offset,
 		struct cred *credp, int	attr_flags);
-int xfs_rename(struct xfs_inode *src_dp, bhv_vname_t *src_vname,
-		struct xfs_inode *target_dp, bhv_vname_t *target_vname);
+int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
+		struct xfs_inode *src_ip, struct xfs_inode *target_dp,
+		struct xfs_name *target_name);
 int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value,
 		int *valuelenp, int flags, cred_t *cred);
 int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
-- 
cgit v0.10.2


From d64e31a2f53cdcb2f95b782196faacb0995ca0c0 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:22:17 +1000
Subject: [XFS] Ensure errors from xfsbdstrat() are correctly checked.

xfsbdstrat() is declared to return an error. That is never checked because
the error is propagated by the xfs_buf_t that is passed through the
function.

Mark xfsbdstrat() as returning void and comment the prototype on the
methods needed for error checking.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30823a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 1d95dca..f6dab5d 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -875,28 +875,26 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
 }
 
 /*
- * Wrapper around bdstrat so that we can stop data
- * from going to disk in case we are shutting down the filesystem.
- * Typically user data goes thru this path; one of the exceptions
- * is the superblock.
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem.  Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
+ *
+ * Nothing is returned: callers must extract any error from the buffer.
  */
-int
+void
 xfsbdstrat(
 	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
 	ASSERT(mp);
 	if (!XFS_FORCED_SHUTDOWN(mp)) {
-		/* Grio redirection would go here
-		 * if (XFS_BUF_IS_GRIO(bp)) {
-		 */
-
 		xfs_buf_iorequest(bp);
-		return 0;
+		/* I/O issued; do not fall through to the shutdown path. */
+		return;
 	}
 
 	xfs_buftrace("XFSBDSTRAT IOERROR", bp);
-	return (xfs_bioerror_relse(bp));
+	xfs_bioerror_relse(bp);
 }
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index e200253..e1d498b 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -68,7 +68,8 @@ extern void xfs_inval_cached_trace(struct xfs_inode *,
 #define xfs_inval_cached_trace(ip, offset, len, first, last)
 #endif
 
-extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
+/* errors from xfsbdstrat() must be extracted from the buffer */
+extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
 extern int xfs_bdstrat_cb(struct xfs_buf *);
 extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 3a8fe7b..1f83298 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -121,7 +121,8 @@ xlog_bread(
 	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
 
 	xfsbdstrat(log->l_mp, bp);
-	if ((error = xfs_iowait(bp)))
+	error = xfs_iowait(bp);
+	if (error)
 		xfs_ioerror_alert("xlog_bread", log->l_mp,
 				  bp, XFS_BUF_ADDR(bp));
 	return error;
@@ -3849,7 +3850,8 @@ xlog_do_recover(
 	XFS_BUF_READ(bp);
 	XFS_BUF_UNASYNC(bp);
 	xfsbdstrat(log->l_mp, bp);
-	if ((error = xfs_iowait(bp))) {
+	error = xfs_iowait(bp);
+	if (error) {
 		xfs_ioerror_alert("xlog_do_recover",
 				  log->l_mp, bp, XFS_BUF_ADDR(bp));
 		ASSERT(0);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 2d03fe1..2fec452 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1470,7 +1470,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
 		XFS_BUF_UNASYNC(sbp);
 		ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
 		xfsbdstrat(mp, sbp);
-		/* Nevermind errors we might get here. */
 		error = xfs_iowait(sbp);
 		if (error)
 			xfs_ioerror_alert("xfs_unmountfs_writesb",
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 4e5c010..cb0c583 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -354,17 +354,15 @@ xfs_trans_read_buf(
 			ASSERT(!XFS_BUF_ISASYNC(bp));
 			XFS_BUF_READ(bp);
 			xfsbdstrat(tp->t_mountp, bp);
-			xfs_iowait(bp);
-			if (XFS_BUF_GETERROR(bp) != 0) {
+			error = xfs_iowait(bp);
+			if (error) {
 				xfs_ioerror_alert("xfs_trans_read_buf", mp,
 						  bp, blkno);
-				error = XFS_BUF_GETERROR(bp);
 				xfs_buf_relse(bp);
 				/*
-				 * We can gracefully recover from most
-				 * read errors. Ones we can't are those
-				 * that happen after the transaction's
-				 * already dirty.
+				 * We can gracefully recover from most read
+				 * errors. Ones we can't are those that happen
+				 * after the transaction's already dirty.
 				 */
 				if (tp->t_flags & XFS_TRANS_DIRTY)
 					xfs_force_shutdown(tp->t_mountp,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index ca38fb9..dd4621e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3825,7 +3825,8 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
 		xfsbdstrat(mp, bp);
-		if ((error = xfs_iowait(bp))) {
+		error = xfs_iowait(bp);
+		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
 					  mp, bp, XFS_BUF_ADDR(bp));
 			break;
@@ -3837,7 +3838,8 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
 		xfsbdstrat(mp, bp);
-		if ((error = xfs_iowait(bp))) {
+		error = xfs_iowait(bp);
+		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
 					  mp, bp, XFS_BUF_ADDR(bp));
 			break;
-- 
cgit v0.10.2


From db7a19f2c89d99b66874a7e0c0dc681ff1f37b4e Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:22:24 +1000
Subject: [XFS] Ensure xfs_bawrite() errors are checked.

xfs_bawrite() can return immediate error status on async writes. Unlike
xfsbdstrat() we don't ever check the error on the buffer after the call,
so we currently do not catch errors at all here. Ensure we catch and
propagate or warn to the syslog about up-front async write errors.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30824a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 15214fb..631ebb3 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1291,7 +1291,7 @@ xfs_qm_dqflush(
 	if (flags & XFS_QMOPT_DELWRI) {
 		xfs_bdwrite(mp, bp);
 	} else if (flags & XFS_QMOPT_ASYNC) {
-		xfs_bawrite(mp, bp);
+		error = xfs_bawrite(mp, bp);
 	} else {
 		error = xfs_bwrite(mp, bp);
 	}
@@ -1582,12 +1582,18 @@ xfs_qm_dqflock_pushbuf_wait(
 		    XFS_INCORE_TRYLOCK);
 	if (bp != NULL) {
 		if (XFS_BUF_ISDELAYWRITE(bp)) {
+			int	error;
 			if (XFS_BUF_ISPINNED(bp)) {
 				xfs_log_force(dqp->q_mount,
 					      (xfs_lsn_t)0,
 					      XFS_LOG_FORCE);
 			}
-			xfs_bawrite(dqp->q_mount, bp);
+			error = xfs_bawrite(dqp->q_mount, bp);
+			if (error)
+				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+					"xfs_qm_dqflock_pushbuf_wait: "
+					"pushbuf error %d on dqp %p, bp %p",
+					error, dqp, bp);
 		} else {
 			xfs_buf_relse(bp);
 		}
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 3dedce1..36e05ca 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -267,11 +267,16 @@ xfs_qm_dquot_logitem_pushbuf(
 					      XFS_LOG_FORCE);
 			}
 			if (dopush) {
+				int	error;
 #ifdef XFSRACEDEBUG
 				delay_for_intr();
 				delay(300);
 #endif
-				xfs_bawrite(mp, bp);
+				error = xfs_bawrite(mp, bp);
+				if (error)
+					xfs_fs_cmn_err(CE_WARN, mp,
+	"xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
+							error, qip, bp);
 			} else {
 				xfs_buf_relse(bp);
 			}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 63debd1..53a71c6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -645,7 +645,12 @@ xfs_buf_item_push(
 	bp = bip->bli_buf;
 
 	if (XFS_BUF_ISDELAYWRITE(bp)) {
-		xfs_bawrite(bip->bli_item.li_mountp, bp);
+		int	error;
+		error = xfs_bawrite(bip->bli_item.li_mountp, bp);
+		if (error)
+			xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp,
+			"xfs_buf_item_push: pushbuf error %d on bip %p, bp %p",
+					error, bip, bp);
 	} else {
 		xfs_buf_relse(bp);
 	}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 63e6689..ca074ee 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3276,7 +3276,7 @@ xfs_iflush(
 	if (flags & INT_DELWRI) {
 		xfs_bdwrite(mp, bp);
 	} else if (flags & INT_ASYNC) {
-		xfs_bawrite(mp, bp);
+		error = xfs_bawrite(mp, bp);
 	} else {
 		error = xfs_bwrite(mp, bp);
 	}
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 2c775b4..93b5db4 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -40,6 +40,7 @@
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_rw.h"
+#include "xfs_error.h"
 
 
 kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
@@ -813,7 +814,12 @@ xfs_inode_item_pushbuf(
 					      XFS_LOG_FORCE);
 			}
 			if (dopush) {
-				xfs_bawrite(mp, bp);
+				int	error;
+				error = xfs_bawrite(mp, bp);
+				if (error)
+					xfs_fs_cmn_err(CE_WARN, mp,
+		"xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
+							error, iip, bp);
 			} else {
 				xfs_buf_relse(bp);
 			}
-- 
cgit v0.10.2


From 958d4ec606d4af590f86a601a238613f21e878ee Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:23:46 +1000
Subject: [XFS] xfs_bdwrite() does not return errors.

xfs_bdwrite() cannot return an error; it only queues buffers to the
delayed write list and as such never encounters anything that can fail.
Mark it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30825a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 142ddbe..52f6846 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1060,7 +1060,7 @@ xfs_buf_iostart(
 		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
 		bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
 		xfs_buf_delwri_queue(bp, 1);
-		return status;
+		return 0;
 	}
 
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a3d207d..841d788 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -387,11 +387,15 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
 	return error;
 }
 
-static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
+/*
+ * No error can be returned from xfs_buf_iostart for delwri
+ * buffers as they are queued and no I/O is issued.
+ */
+static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
 {
 	bp->b_strat = xfs_bdstrat_cb;
 	bp->b_fspriv3 = mp;
-	return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
+	(void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
 }
 
 #define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
-- 
cgit v0.10.2


From cc88466f3f67bb16fc91b0b974e51c2a43a9e597 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:23:52 +1000
Subject: [XFS] Catch unwritten extent conversion errors.

On unwritten I/O completion, we fail to propagate an error when converting
the extent to a written extent. This means that the I/O silently fails.
Propagate the error onto the ioend so that the inode is marked with an
error appropriately.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30826a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 169e6c0..a55c3b2 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -243,8 +243,12 @@ xfs_end_bio_unwritten(
 	size_t			size = ioend->io_size;
 
 	if (likely(!ioend->io_error)) {
-		if (!XFS_FORCED_SHUTDOWN(ip->i_mount))
-			xfs_iomap_write_unwritten(ip, offset, size);
+		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+			int error;
+			error = xfs_iomap_write_unwritten(ip, offset, size);
+			if (error)
+				ioend->io_error = error;
+		}
 		xfs_setfilesize(ioend);
 	}
 	xfs_destroy_ioend(ioend);
-- 
cgit v0.10.2


From e4ac967b117c5780760abbd9ae996210c31cb398 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:23:58 +1000
Subject: [XFS] xfs_iflush_fork() never returns an error.

xfs_iflush_fork() never returns an error. Mark it void and clean up the
code calling it that checks for errors.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30827a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ca074ee..2bc2279 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2883,7 +2883,7 @@ xfs_iextents_copy(
  * format indicates the current state of the fork.
  */
 /*ARGSUSED*/
-STATIC int
+STATIC void
 xfs_iflush_fork(
 	xfs_inode_t		*ip,
 	xfs_dinode_t		*dip,
@@ -2904,16 +2904,16 @@ xfs_iflush_fork(
 	static const short	extflag[2] =
 		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
 
-	if (iip == NULL)
-		return 0;
+	if (!iip)
+		return;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	/*
 	 * This can happen if we gave up in iformat in an error path,
 	 * for the attribute fork.
 	 */
-	if (ifp == NULL) {
+	if (!ifp) {
 		ASSERT(whichfork == XFS_ATTR_FORK);
-		return 0;
+		return;
 	}
 	cp = XFS_DFORK_PTR(dip, whichfork);
 	mp = ip->i_mount;
@@ -2974,8 +2974,6 @@ xfs_iflush_fork(
 		ASSERT(0);
 		break;
 	}
-
-	return 0;
 }
 
 STATIC int
@@ -3452,16 +3450,9 @@ xfs_iflush_int(
 		}
 	}
 
-	if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
-		goto corrupt_out;
-	}
-
-	if (XFS_IFORK_Q(ip)) {
-		/*
-		 * The only error from xfs_iflush_fork is on the data fork.
-		 */
-		(void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
-	}
+	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
+	if (XFS_IFORK_Q(ip))
+		xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
 	xfs_inobp_check(mp, bp);
 
 	/*
-- 
cgit v0.10.2


From 7b07339048f7b020575706b492c004b5664b67ab Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:04 +1000
Subject: [XFS] xfs_bulkstat_one_dinode() never returns an error.

Mark it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30828a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 45d8776..eb85bde 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -129,7 +129,7 @@ xfs_bulkstat_one_iget(
 	return error;
 }
 
-STATIC int
+STATIC void
 xfs_bulkstat_one_dinode(
 	xfs_mount_t	*mp,		/* mount point for filesystem */
 	xfs_ino_t	ino,		/* inode number to get data for */
@@ -198,8 +198,6 @@ xfs_bulkstat_one_dinode(
 		buf->bs_blocks = be64_to_cpu(dic->di_nblocks);
 		break;
 	}
-
-	return 0;
 }
 
 STATIC int
-- 
cgit v0.10.2


From 64bfe1bfae833e89ed77f72c61ded19f4b1976f8 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:10 +1000
Subject: [XFS] Catch errors from xfs_imap().

Catch errors from xfs_imap() in log recovery when we might be trying to
map an invalid inode number due to a corrupted log.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30829a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1f83298..a803943 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2282,7 +2282,9 @@ xlog_recover_do_inode_trans(
 		 * invalidate the buffer when we write it out below.
 		 */
 		imap.im_blkno = 0;
-		xfs_imap(log->l_mp, NULL, ino, &imap, 0);
+		error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
+		if (error)
+			goto error;
 	}
 
 	/*
-- 
cgit v0.10.2


From 78e9da77f1bf265fe750b9223ec15707473fb6e8 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:17 +1000
Subject: [XFS] Don't allow silent errors in xfs_inactive().

xfs_inactive() fails to report errors when committing the inactive
transaction. Hence we can get silent failures either finishing off the
truncation or committing the transaction. Even if we get errors, we need
to continue, so simply warn loudly to the system if we get errors here.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30830a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index dd4621e..6650601 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1743,11 +1743,18 @@ xfs_inactive(
 		XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
 
 		/*
-		 * Just ignore errors at this point.  There is
-		 * nothing we can do except to try to keep going.
+		 * Just ignore errors at this point.  There is nothing we can
+		 * do except to try to keep going. Make sure it's not a silent
+		 * error.
 		 */
-		(void) xfs_bmap_finish(&tp,  &free_list, &committed);
-		(void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		error = xfs_bmap_finish(&tp,  &free_list, &committed);
+		if (error)
+			xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
+				"xfs_bmap_finish() returned error %d", error);
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		if (error)
+			xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
+				"xfs_trans_commit() returned error %d", error);
 	}
 	/*
 	 * Release the dquots held by inode, if any.
-- 
cgit v0.10.2


From 234f56aca20a4f66b6ba3d3bf2787634dd9e0999 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:24 +1000
Subject: [XFS] Check for errors when changing buffer pointers.

xfs_buf_associate_memory() can fail, but the return is never checked.
Propagate the error through XFS_BUF_SET_PTR() so that failures are
detected.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30831a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a803943..e65ab4a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1162,10 +1162,14 @@ xlog_write_log_records(
 		if (j == 0 && (start_block + endcount > ealign)) {
 			offset = XFS_BUF_PTR(bp);
 			balign = BBTOB(ealign - start_block);
-			XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb));
-			if ((error = xlog_bread(log, ealign, sectbb, bp)))
+			error = XFS_BUF_SET_PTR(bp, offset + balign,
+						BBTOB(sectbb));
+			if (!error)
+				error = xlog_bread(log, ealign, sectbb, bp);
+			if (!error)
+				error = XFS_BUF_SET_PTR(bp, offset, bufblks);
+			if (error)
 				break;
-			XFS_BUF_SET_PTR(bp, offset, bufblks);
 		}
 
 		offset = xlog_align(log, start_block, endcount, bp);
@@ -3630,15 +3634,19 @@ xlog_do_recovery_pass(
 				 *   _first_, then the log start (LR header end)
 				 *   - order is important.
 				 */
+				wrapped_hblks = hblks - split_hblks;
 				bufaddr = XFS_BUF_PTR(hbp);
-				XFS_BUF_SET_PTR(hbp,
+				error = XFS_BUF_SET_PTR(hbp,
 						bufaddr + BBTOB(split_hblks),
 						BBTOB(hblks - split_hblks));
-				wrapped_hblks = hblks - split_hblks;
-				error = xlog_bread(log, 0, wrapped_hblks, hbp);
+				if (!error)
+					error = xlog_bread(log, 0,
+							wrapped_hblks, hbp);
+				if (!error)
+					error = XFS_BUF_SET_PTR(hbp, bufaddr,
+							BBTOB(hblks));
 				if (error)
 					goto bread_err2;
-				XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks));
 				if (!offset)
 					offset = xlog_align(log, 0,
 							wrapped_hblks, hbp);
@@ -3690,13 +3698,18 @@ xlog_do_recovery_pass(
 				 *   - order is important.
 				 */
 				bufaddr = XFS_BUF_PTR(dbp);
-				XFS_BUF_SET_PTR(dbp,
+				error = XFS_BUF_SET_PTR(dbp,
 						bufaddr + BBTOB(split_bblks),
 						BBTOB(bblks - split_bblks));
-				if ((error = xlog_bread(log, wrapped_hblks,
-						bblks - split_bblks, dbp)))
+				if (!error)
+					error = xlog_bread(log, wrapped_hblks,
+							bblks - split_bblks,
+							dbp);
+				if (!error)
+					error = XFS_BUF_SET_PTR(dbp, bufaddr,
+							h_size);
+				if (error)
 					goto bread_err2;
-				XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
 				if (!offset)
 					offset = xlog_align(log, wrapped_hblks,
 						bblks - split_bblks, dbp);
-- 
cgit v0.10.2


From b911ca0472c3762d2bafc4d21e432a9056844064 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:30 +1000
Subject: [XFS] Sanitise xfs_log_force error checking.

xfs_log_force() is declared to return an error, but we almost never check
it. We don't need to check it in most cases; if there's a log I/O error
then we'll be shutting down the filesystem anyway and that means we'll
catch the error somewhere else.

However, on certain calls we should be returning an error - sync
transactions, fsync, sync writes, etc. so this isn't a pure black and
white distinction. Hence make xfs_log_force() a void function that issues
a warning to the syslog on error, and call _xfs_log_force() in all the
places where we actually care about the error status returned.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30832a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bece882..e29ea0a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -382,7 +382,27 @@ _xfs_log_force(
 		return xlog_state_sync_all(log, flags, log_flushed);
 	else
 		return xlog_state_sync(log, lsn, flags, log_flushed);
-}	/* xfs_log_force */
+}	/* _xfs_log_force */
+
+/*
+ * Wrapper for _xfs_log_force(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force(
+	xfs_mount_t	*mp,
+	xfs_lsn_t	lsn,
+	uint		flags)
+{
+	int	error;
+	error = _xfs_log_force(mp, lsn, flags, NULL);
+	if (error) {
+		xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
+			"error %d returned.", error);
+	}
+}
+
 
 /*
  * Attaches a new iclog I/O completion callback routine during
@@ -634,7 +654,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
 		return 0;
 
-	xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+	error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL);
+	ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
 
 #ifdef DEBUG
 	first_iclog = iclog = log->l_iclog;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 4cdac04..d1d678e 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -142,8 +142,9 @@ int	  _xfs_log_force(struct xfs_mount *mp,
 			 xfs_lsn_t	lsn,
 			 uint		flags,
 			 int		*log_forced);
-#define xfs_log_force(mp, lsn, flags) \
-	_xfs_log_force(mp, lsn, flags, NULL);
+void	  xfs_log_force(struct xfs_mount	*mp,
+			xfs_lsn_t		lsn,
+			uint			flags);
 int	  xfs_log_mount(struct xfs_mount	*mp,
 			struct xfs_buftarg	*log_target,
 			xfs_daddr_t		start_block,
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index cd3ece6..b0f31c0 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -126,11 +126,11 @@ xfs_write_sync_logforce(
 		 * when we return.
 		 */
 		if (iip && iip->ili_last_lsn) {
-			xfs_log_force(mp, iip->ili_last_lsn,
-					XFS_LOG_FORCE | XFS_LOG_SYNC);
+			error = _xfs_log_force(mp, iip->ili_last_lsn,
+					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
 		} else if (xfs_ipincount(ip) > 0) {
-			xfs_log_force(mp, (xfs_lsn_t)0,
-					XFS_LOG_FORCE | XFS_LOG_SYNC);
+			error = _xfs_log_force(mp, (xfs_lsn_t)0,
+					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
 		}
 
 	} else {
-- 
cgit v0.10.2


From 1bb7d6b5a82f1d9487fd44415484a368f7c87bed Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:38 +1000
Subject: [XFS] Catch log unmount failures.

Unmounting the log can fail. Unlikely, but it can. Catch all the error
conditions and make sure they are propagated upwards.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30833a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e29ea0a..afaee30 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -697,7 +697,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		atomic_inc(&iclog->ic_refcnt);
 		spin_unlock(&log->l_icloglock);
 		xlog_state_want_sync(log, iclog);
-		(void) xlog_state_release_iclog(log, iclog);
+		error = xlog_state_release_iclog(log, iclog);
 
 		spin_lock(&log->l_icloglock);
 		if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
@@ -736,7 +736,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		spin_unlock(&log->l_icloglock);
 
 		xlog_state_want_sync(log, iclog);
-		(void) xlog_state_release_iclog(log, iclog);
+		error =  xlog_state_release_iclog(log, iclog);
 
 		spin_lock(&log->l_icloglock);
 
@@ -751,7 +751,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		}
 	}
 
-	return 0;
+	return error;
 }	/* xfs_log_unmount_write */
 
 /*
-- 
cgit v0.10.2


From d4055947bd0913864f4d8ac96bf1197338071622 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 17 Apr 2008 16:49:35 +1000
Subject: [XFS] Don't error out on good I/Os.

xfsbdstrat() made all I/Os error out, good or bad. Fix it.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30836a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Donald Douwsma <donaldd@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index f6dab5d..21c0dbc 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -885,8 +885,10 @@ xfsbdstrat(
 	struct xfs_buf		*bp)
 {
 	ASSERT(mp);
-	if (!XFS_FORCED_SHUTDOWN(mp))
+	if (!XFS_FORCED_SHUTDOWN(mp)) {
 		xfs_buf_iorequest(bp);
+		return;
+	}
 
 	xfs_buftrace("XFSBDSTRAT IOERROR", bp);
 	xfs_bioerror_relse(bp);
-- 
cgit v0.10.2


From e6430037e9fd0b3d02ceaf5ab99bfe3ccb763be7 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 17 Apr 2008 16:49:49 +1000
Subject: [XFS] fix logic error in xfs_alloc_ag_vextent_near()

Fix a logic error in xfs_alloc_ag_vextent_near(). This is a regression
introduced by the error handling changes.

SGI-PV: 890084
SGI-Modid: xfs-linux-melb:xfs-kern:30838a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index facdae1..1956f83 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -838,7 +838,7 @@ xfs_alloc_ag_vextent_near(
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 			xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
 					args->minlen, &ltbnoa, &ltlena);
-			if (ltlena >= args->minlen)
+			if (ltlena < args->minlen)
 				continue;
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
-- 
cgit v0.10.2


From 7e20694d91f817f8e9f62404aca793ae0df4d98a Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 17 Apr 2008 16:49:55 +1000
Subject: [XFS] Remove periodic logging of in-core superblock counters.

xfssyncd triggers the logging of superblock counters every 30s if the
filesystem is made with lazy-count=1. This will prevent disks from idling
and spinning down as there will be a log write every 30s. With the way
counter recovery works for lazy-count=1, this code is unnecessary and
provides no real benefit, so just remove it.

SGI-PV: 980145
SGI-Modid: xfs-linux-melb:xfs-kern:30840a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index fb561be..865eb70 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1028,8 +1028,7 @@ xfs_sync_worker(
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY))
-		error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR |
-				     SYNC_REFCACHE | SYNC_SUPER);
+		error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
 	mp->m_sync_seq++;
 	wake_up(&mp->m_wait_single_sync_task);
 }
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 4da03a4..7e60c77 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -49,7 +49,6 @@ typedef struct bhv_vfs_sync_work {
 #define SYNC_REFCACHE		0x0040  /* prune some of the nfs ref cache */
 #define SYNC_REMOUNT		0x0080  /* remount readonly, no dummy LRs */
 #define SYNC_IOWAIT		0x0100  /* wait for all I/O to complete */
-#define SYNC_SUPER		0x0200  /* flush superblock to disk */
 
 /*
  * When remounting a filesystem read-only or freezing the filesystem,
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 09e186d..fc48158 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -1317,21 +1317,8 @@ xfs_syncsub(
 	}
 
 	/*
-	 * If asked, update the disk superblock with incore counter values if we
-	 * are using non-persistent counters so that they don't get too far out
-	 * of sync if we crash or get a forced shutdown. We don't want to force
-	 * this to disk, just get a transaction into the iclogs....
-	 */
-	if (flags & SYNC_SUPER) {
-		error = xfs_log_sbcount(mp, 0);
-		if (error)
-			last_error = error;
-	}
-
-	/*
 	 * Now check to see if the log needs a "dummy" transaction.
 	 */
-
 	if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
 		xfs_trans_t *tp;
 		xfs_inode_t *ip;
-- 
cgit v0.10.2


From f6485057c5cfbc84e5eff639ddea1ce0d668607b Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 17 Apr 2008 16:50:04 +1000
Subject: [XFS] Ensure the inode is joined in xfs_itruncate_finish

On success, we still need to join the inode to the current transaction in
xfs_itruncate_finish(). Fixes regression from error handling changes.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30845a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2bc2279..ca12acb 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1464,51 +1464,50 @@ xfs_itruncate_start(
 }
 
 /*
- * Shrink the file to the given new_size.  The new
- * size must be smaller than the current size.
- * This will free up the underlying blocks
- * in the removed range after a call to xfs_itruncate_start()
- * or xfs_atruncate_start().
+ * Shrink the file to the given new_size.  The new size must be smaller than
+ * the current size.  This will free up the underlying blocks in the removed
+ * range after a call to xfs_itruncate_start() or xfs_atruncate_start().
  *
- * The transaction passed to this routine must have made
- * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
- * This routine may commit the given transaction and
- * start new ones, so make sure everything involved in
- * the transaction is tidy before calling here.
- * Some transaction will be returned to the caller to be
- * committed.  The incoming transaction must already include
- * the inode, and both inode locks must be held exclusively.
- * The inode must also be "held" within the transaction.  On
- * return the inode will be "held" within the returned transaction.
- * This routine does NOT require any disk space to be reserved
- * for it within the transaction.
+ * The transaction passed to this routine must have made a permanent log
+ * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
+ * given transaction and start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here.  Some transaction will be
+ * returned to the caller to be committed.  The incoming transaction must
+ * already include the inode, and both inode locks must be held exclusively.
+ * The inode must also be "held" within the transaction.  On return the inode
+ * will be "held" within the returned transaction.  This routine does NOT
+ * require any disk space to be reserved for it within the transaction.
  *
- * The fork parameter must be either xfs_attr_fork or xfs_data_fork,
- * and it indicates the fork which is to be truncated.  For the
- * attribute fork we only support truncation to size 0.
+ * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it
+ * indicates the fork which is to be truncated.  For the attribute fork we only
+ * support truncation to size 0.
  *
- * We use the sync parameter to indicate whether or not the first
- * transaction we perform might have to be synchronous.  For the attr fork,
- * it needs to be so if the unlink of the inode is not yet known to be
- * permanent in the log.  This keeps us from freeing and reusing the
- * blocks of the attribute fork before the unlink of the inode becomes
- * permanent.
+ * We use the sync parameter to indicate whether or not the first transaction
+ * we perform might have to be synchronous.  For the attr fork, it needs to be
+ * so if the unlink of the inode is not yet known to be permanent in the log.
+ * This keeps us from freeing and reusing the blocks of the attribute fork
+ * before the unlink of the inode becomes permanent.
  *
- * For the data fork, we normally have to run synchronously if we're
- * being called out of the inactive path or we're being called
- * out of the create path where we're truncating an existing file.
- * Either way, the truncate needs to be sync so blocks don't reappear
- * in the file with altered data in case of a crash.  wsync filesystems
- * can run the first case async because anything that shrinks the inode
- * has to run sync so by the time we're called here from inactive, the
- * inode size is permanently set to 0.
+ * For the data fork, we normally have to run synchronously if we're being
+ * called out of the inactive path or we're being called out of the create path
+ * where we're truncating an existing file.  Either way, the truncate needs to
+ * be sync so blocks don't reappear in the file with altered data in case of a
+ * crash.  wsync filesystems can run the first case async because anything that
+ * shrinks the inode has to run sync so by the time we're called here from
+ * inactive, the inode size is permanently set to 0.
  *
- * Calls from the truncate path always need to be sync unless we're
- * in a wsync filesystem and the file has already been unlinked.
+ * Calls from the truncate path always need to be sync unless we're in a wsync
+ * filesystem and the file has already been unlinked.
  *
- * The caller is responsible for correctly setting the sync parameter.
- * It gets too hard for us to guess here which path we're being called
- * out of just based on inode state.
+ * The caller is responsible for correctly setting the sync parameter.  It gets
+ * too hard for us to guess here which path we're being called out of just
+ * based on inode state.
+ *
+ * If we get an error, we must return with the inode locked and linked into the
+ * current transaction. This keeps things simple for the higher level code,
+ * because it always knows that the inode is locked and held in the transaction
+ * that returns to it whether errors occur or not.  We don't mark the inode
+ * dirty on error so that transactions can be easily aborted if possible.
  */
 int
 xfs_itruncate_finish(
@@ -1687,45 +1686,51 @@ xfs_itruncate_finish(
 		 */
 		error = xfs_bmap_finish(tp, &free_list, &committed);
 		ntp = *tp;
+		if (committed) {
+			/* link the inode into the next xact in the chain */
+			xfs_trans_ijoin(ntp, ip,
+					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+			xfs_trans_ihold(ntp, ip);
+		}
+
 		if (error) {
 			/*
-			 * If the bmap finish call encounters an error,
-			 * return to the caller where the transaction
-			 * can be properly aborted.  We just need to
-			 * make sure we're not holding any resources
-			 * that we were not when we came in.
+			 * If the bmap finish call encounters an error, return
+			 * to the caller where the transaction can be properly
+			 * aborted.  We just need to make sure we're not
+			 * holding any resources that we were not when we came
+			 * in.
 			 *
-			 * Aborting from this point might lose some
-			 * blocks in the file system, but oh well.
+			 * Aborting from this point might lose some blocks in
+			 * the file system, but oh well.
 			 */
 			xfs_bmap_cancel(&free_list);
-			if (committed)
-				goto error_join;
 			return error;
 		}
 
 		if (committed) {
 			/*
-			 * The first xact was committed, so add the inode to
-			 * the new one.  Mark it dirty so it will be logged and
+			 * Mark the inode dirty so it will be logged and
 			 * moved forward in the log as part of every commit.
 			 */
-			xfs_trans_ijoin(ntp, ip,
-					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-			xfs_trans_ihold(ntp, ip);
 			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
 		}
+
 		ntp = xfs_trans_dup(ntp);
 		error = xfs_trans_commit(*tp, 0);
 		*tp = ntp;
-		if (error)
-			goto error_join;
-		error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-					  XFS_TRANS_PERM_LOG_RES,
-					  XFS_ITRUNCATE_LOG_COUNT);
-		if (error)
-			goto error_join;
 
+		/* link the inode into the next transaction in the chain */
+		xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+		xfs_trans_ihold(ntp, ip);
+
+		if (!error)
+			error = xfs_trans_reserve(ntp, 0,
+					XFS_ITRUNCATE_LOG_RES(mp), 0,
+					XFS_TRANS_PERM_LOG_RES,
+					XFS_ITRUNCATE_LOG_COUNT);
+		if (error)
+			return error;
 	}
 	/*
 	 * Only update the size in the case of the data fork, but
@@ -1757,18 +1762,6 @@ xfs_itruncate_finish(
 	       (ip->i_d.di_nextents == 0));
 	xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
 	return 0;
-
-error_join:
-	/*
-	 * Add the inode being truncated to the next chained transaction.  This
-	 * keeps things simple for the higher level code, because it always
-	 * knows that the inode is locked and held in the transaction that
-	 * returns to it whether errors occur or not.  We don't mark the inode
-	 * dirty so that this transaction can be easily aborted if possible.
-	 */
-	xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(ntp, ip);
-	return error;
 }
 
 
-- 
cgit v0.10.2


From cb49dbb130e17a6f9af4cb4714cf6976cf09afdf Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 17 Apr 2008 16:50:09 +1000
Subject: [XFS] Always use di_forkoff when checking for attr space.

In the case where we mount a filesystem which was previously using the
attr2 format as attr1, returning the default mp->m_attroffset instead of
the per-inode di_forkoff for inline attribute fit calculations, may result
in corruption, if for example, the data fork is already taking more space
than the default fork offset and we try to add an extended attribute. Fix
tested by xfstests/186.

SGI-PV: 979606
SGI-Modid: xfs-linux-melb:xfs-kern:30861a

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 96ba6aa..303d41e 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -166,7 +166,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
 
 	if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
 		if (bytes <= XFS_IFORK_ASIZE(dp))
-			return mp->m_attroffset >> 3;
+			return dp->i_d.di_forkoff;
 		return 0;
 	}
 
-- 
cgit v0.10.2


From 6d1337b29bf09a97682d39db36ac2d0dfc6659c0 Mon Sep 17 00:00:00 2001
From: Tim Shimmin <tes@sgi.com>
Date: Thu, 17 Apr 2008 16:50:16 +1000
Subject: [XFS] xfs_bmap_compute_maxlevels should be based on di_forkoff

Fix up xfs_bmap_compute_maxlevels() to account for the case when we go
from using attr2 to using attr1. In that case attr1 will no longer
necessarily be at m_attroffset >> 3, but could be at a different value for
di_forkoff. Therefore, we return the worst case scenario using MINDBTPTRS
and MINABTPTRS, as this function is used for determining the maximum log
space.

SGI-PV: 979606
SGI-Modid: xfs-linux-melb:xfs-kern:30862a

Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 6d9b5448..eb198c0 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4153,16 +4153,21 @@ xfs_bmap_compute_maxlevels(
 	 * number of leaf entries, is controlled by the type of di_nextents
 	 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
 	 * (a signed 16-bit number, xfs_aextnum_t).
+	 *
+	 * Note that we can no longer assume that if we are in ATTR1 that
+	 * the fork offset of all the inodes will be (m_attroffset >> 3)
+	 * because we could have mounted with ATTR2 and then mounted back
+	 * with ATTR1, keeping the di_forkoff's fixed but probably at
+	 * various positions. Therefore, for both ATTR1 and ATTR2
+	 * we have to assume the worst case scenario of a minimum size
+	 * available.
 	 */
 	if (whichfork == XFS_DATA_FORK) {
 		maxleafents = MAXEXTNUM;
-		sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
-			XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset;
+		sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
 	} else {
 		maxleafents = MAXAEXTNUM;
-		sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
-			XFS_BMDR_SPACE_CALC(MINABTPTRS) :
-			mp->m_sb.sb_inodesize - mp->m_attroffset;
+		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
 	}
 	maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
 	minleafrecs = mp->m_bmap_dmnr[0];
-- 
cgit v0.10.2


From f7d3c34788696f5ba9ac9fa414ad80e2a91d4b2e Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 17 Apr 2008 16:50:22 +1000
Subject: [XFS] Remove CONFIG_XFS_SECURITY.

There is no point to the CONFIG_XFS_SECURITY option; it disables the
ability to set security attributes at runtime, but it does not actually
slim down or remove any code for runtime. Just remove it and always allow
security attributes to be set.

SGI-PV: 980310
SGI-Modid: xfs-linux-melb:xfs-kern:30877a

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 35115bc..524021f 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -35,18 +35,6 @@ config XFS_QUOTA
 	  with or without the generic quota support enabled (CONFIG_QUOTA) -
 	  they are completely independent subsystems.
 
-config XFS_SECURITY
-	bool "XFS Security Label support"
-	depends on XFS_FS
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute namespace for inode security
-	  labels in the XFS filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for inode security labels, say N.
-
 config XFS_POSIX_ACL
 	bool "XFS POSIX ACL support"
 	depends on XFS_FS
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 3efcf45..3efb7c6 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -50,13 +50,7 @@ extern void xfs_qm_exit(void);
 # define set_posix_acl_flag(sb)	do { } while (0)
 #endif
 
-#ifdef CONFIG_XFS_SECURITY
-# define XFS_SECURITY_STRING	"security attributes, "
-# define ENOSECURITY		0
-#else
-# define XFS_SECURITY_STRING
-# define ENOSECURITY		EOPNOTSUPP
-#endif
+#define XFS_SECURITY_STRING	"security attributes, "
 
 #ifdef CONFIG_XFS_RT
 # define XFS_REALTIME_STRING	"realtime, "
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index e58f321..36d781e 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2647,14 +2647,6 @@ attr_trusted_capable(
 }
 
 STATIC int
-attr_secure_capable(
-	bhv_vnode_t	*vp,
-	cred_t		*cred)
-{
-	return -ENOSECURITY;
-}
-
-STATIC int
 attr_system_set(
 	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
@@ -2724,7 +2716,7 @@ struct attrnames attr_secure = {
 	.attr_get	= attr_generic_get,
 	.attr_set	= attr_generic_set,
 	.attr_remove	= attr_generic_remove,
-	.attr_capable	= attr_secure_capable,
+	.attr_capable	= (attrcapable_t)fs_noerr,
 };
 
 struct attrnames attr_user = {
-- 
cgit v0.10.2


From e687330b5ed1ea899fdaf0dea50aba196b6e019a Mon Sep 17 00:00:00 2001
From: Donald Douwsma <donaldd@sgi.com>
Date: Thu, 17 Apr 2008 16:50:28 +1000
Subject: [XFS] Remove unused HAVE_SPLICE macro.

HAVE_SPLICE was part of the infrastructure for building 2.4 and 2.6
kernels out of the same tree. Now that we don't build 2.4 kernels, this
macro is unused and can be removed.

SGI-PV: 971046
SGI-Modid: xfs-linux-melb:xfs-kern:30878a

Signed-off-by: Donald Douwsma <donaldd@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 540e4c9..765aaf6 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -22,7 +22,7 @@
 #define STATIC
 #define DEBUG 1
 #define XFS_BUF_LOCK_TRACKING 1
-/* #define QUOTADEBUG 1 */
+#define QUOTADEBUG 1
 #endif
 
 #ifdef CONFIG_XFS_TRACE
-- 
cgit v0.10.2


From 033bfb1a65242e0d60e6fc991cd9b3553053d334 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Fri, 11 Apr 2008 17:05:49 +1000
Subject: [XFS] Update XFS Documentation for ikeep and ihashsize

Update xfs docs for:
* In-memory inode hashes have been removed.
* noikeep is now the default.

SGI-PV: 969561
SGI-Modid: 2.6.x-xfs-melb:linux:29481b

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tim Shimmin <tes@sgi.com>

diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 74aeb14..10ba81f 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -52,16 +52,14 @@ When mounting an XFS filesystem, the following options are accepted.
 	and also gets the setgid bit set if it is a directory itself.
 
   ihashsize=value
-	Sets the number of hash buckets available for hashing the
-	in-memory inodes of the specified mount point.  If a value
-	of zero is used, the value selected by the default algorithm
-	will be displayed in /proc/mounts.
+	In memory inode hashes have been removed, so this option has
+	no function as of August 2007. Option is deprecated.
 
   ikeep/noikeep
-	When inode clusters are emptied of inodes, keep them around
-	on the disk (ikeep) - this is the traditional XFS behaviour
-	and is still the default for now.  Using the noikeep option,
-	inode clusters are returned to the free space pool.
+	When ikeep is specified, XFS does not delete empty inode clusters
+	and keeps them around on disk. ikeep is the traditional XFS
+	behaviour. When noikeep is specified, empty inode clusters
+	are returned to the free space pool. The default is noikeep.
 
   inode64
 	Indicates that XFS is allowed to create inodes at any location
-- 
cgit v0.10.2


From f6e9f28865552bd9d79a9df93cf120436b073223 Mon Sep 17 00:00:00 2001
From: Josef Sipek <jeffpc@josefsipek.net>
Date: Fri, 11 Apr 2008 17:11:02 +1000
Subject: [XFS] Update XFS documentation for noikeep/ikeep.

Mention how DMAPI affects default for noikeep.
Slightly modified since Josef's patch was based on
an old xfs.txt prior to Dave's (dgc) checkin which
missed going to oss.

Signed-off-by: Josef Sipek <jeffpc@josefsipek.net>
Signed-off-by: Tim Shimmin <tes@sgi.com>

diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 10ba81f..0a1668b 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -59,7 +59,8 @@ When mounting an XFS filesystem, the following options are accepted.
 	When ikeep is specified, XFS does not delete empty inode clusters
 	and keeps them around on disk. ikeep is the traditional XFS
 	behaviour. When noikeep is specified, empty inode clusters
-	are returned to the free space pool. The default is noikeep.
+	are returned to the free space pool. The default is noikeep for
+	non-DMAPI mounts, while ikeep is the default when DMAPI is in use.
 
   inode64
 	Indicates that XFS is allowed to create inodes at any location
-- 
cgit v0.10.2


From 3b2816be271b8b364294a5b48721a3e68af46cfa Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>
Date: Fri, 18 Apr 2008 12:43:35 +1000
Subject: [XFS] The forward declarations for the xfs_ioctl() helpers and the
 associated comment about gcc behavior really aren't needed; all of these
 functions are marked STATIC which includes noinline, and the stack usage
 won't be a problem.

This effectively just removes the forward declarations and moves
xfs_ioctl() back to the end of the file.

SGI-PV: 971186
SGI-Modid: xfs-linux-melb:xfs-kern:30534a

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index c6399b2..640c8b6 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -651,314 +651,6 @@ xfs_attrmulti_by_handle(
 	return -error;
 }
 
-/* prototypes for a few of the stack-hungry cases that have
- * their own functions.  Functions are defined after their use
- * so gcc doesn't get fancy and inline them with -03 */
-
-STATIC int
-xfs_ioc_space(
-	struct xfs_inode	*ip,
-	struct inode		*inode,
-	struct file		*filp,
-	int			flags,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_bulkstat(
-	xfs_mount_t		*mp,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_fsgeometry_v1(
-	xfs_mount_t		*mp,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_fsgeometry(
-	xfs_mount_t		*mp,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_xattr(
-	xfs_inode_t		*ip,
-	struct file		*filp,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_fsgetxattr(
-	xfs_inode_t		*ip,
-	int			attr,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_getbmap(
-	struct xfs_inode	*ip,
-	int			flags,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_getbmapx(
-	struct xfs_inode	*ip,
-	void			__user *arg);
-
-int
-xfs_ioctl(
-	xfs_inode_t		*ip,
-	struct file		*filp,
-	int			ioflags,
-	unsigned int		cmd,
-	void			__user *arg)
-{
-	struct inode		*inode = filp->f_path.dentry->d_inode;
-	xfs_mount_t		*mp = ip->i_mount;
-	int			error;
-
-	xfs_itrace_entry(XFS_I(inode));
-	switch (cmd) {
-
-	case XFS_IOC_ALLOCSP:
-	case XFS_IOC_FREESP:
-	case XFS_IOC_RESVSP:
-	case XFS_IOC_UNRESVSP:
-	case XFS_IOC_ALLOCSP64:
-	case XFS_IOC_FREESP64:
-	case XFS_IOC_RESVSP64:
-	case XFS_IOC_UNRESVSP64:
-		/*
-		 * Only allow the sys admin to reserve space unless
-		 * unwritten extents are enabled.
-		 */
-		if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
-		    !capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
-
-	case XFS_IOC_DIOINFO: {
-		struct dioattr	da;
-		xfs_buftarg_t	*target =
-			XFS_IS_REALTIME_INODE(ip) ?
-			mp->m_rtdev_targp : mp->m_ddev_targp;
-
-		da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
-		da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
-
-		if (copy_to_user(arg, &da, sizeof(da)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_FSBULKSTAT_SINGLE:
-	case XFS_IOC_FSBULKSTAT:
-	case XFS_IOC_FSINUMBERS:
-		return xfs_ioc_bulkstat(mp, cmd, arg);
-
-	case XFS_IOC_FSGEOMETRY_V1:
-		return xfs_ioc_fsgeometry_v1(mp, arg);
-
-	case XFS_IOC_FSGEOMETRY:
-		return xfs_ioc_fsgeometry(mp, arg);
-
-	case XFS_IOC_GETVERSION:
-		return put_user(inode->i_generation, (int __user *)arg);
-
-	case XFS_IOC_FSGETXATTR:
-		return xfs_ioc_fsgetxattr(ip, 0, arg);
-	case XFS_IOC_FSGETXATTRA:
-		return xfs_ioc_fsgetxattr(ip, 1, arg);
-	case XFS_IOC_GETXFLAGS:
-	case XFS_IOC_SETXFLAGS:
-	case XFS_IOC_FSSETXATTR:
-		return xfs_ioc_xattr(ip, filp, cmd, arg);
-
-	case XFS_IOC_FSSETDM: {
-		struct fsdmidata	dmi;
-
-		if (copy_from_user(&dmi, arg, sizeof(dmi)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
-				dmi.fsd_dmstate);
-		return -error;
-	}
-
-	case XFS_IOC_GETBMAP:
-	case XFS_IOC_GETBMAPA:
-		return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
-
-	case XFS_IOC_GETBMAPX:
-		return xfs_ioc_getbmapx(ip, arg);
-
-	case XFS_IOC_FD_TO_HANDLE:
-	case XFS_IOC_PATH_TO_HANDLE:
-	case XFS_IOC_PATH_TO_FSHANDLE:
-		return xfs_find_handle(cmd, arg);
-
-	case XFS_IOC_OPEN_BY_HANDLE:
-		return xfs_open_by_handle(mp, arg, filp, inode);
-
-	case XFS_IOC_FSSETDM_BY_HANDLE:
-		return xfs_fssetdm_by_handle(mp, arg, inode);
-
-	case XFS_IOC_READLINK_BY_HANDLE:
-		return xfs_readlink_by_handle(mp, arg, inode);
-
-	case XFS_IOC_ATTRLIST_BY_HANDLE:
-		return xfs_attrlist_by_handle(mp, arg, inode);
-
-	case XFS_IOC_ATTRMULTI_BY_HANDLE:
-		return xfs_attrmulti_by_handle(mp, arg, inode);
-
-	case XFS_IOC_SWAPEXT: {
-		error = xfs_swapext((struct xfs_swapext __user *)arg);
-		return -error;
-	}
-
-	case XFS_IOC_FSCOUNTS: {
-		xfs_fsop_counts_t out;
-
-		error = xfs_fs_counts(mp, &out);
-		if (error)
-			return -error;
-
-		if (copy_to_user(arg, &out, sizeof(out)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_SET_RESBLKS: {
-		xfs_fsop_resblks_t inout;
-		__uint64_t	   in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&inout, arg, sizeof(inout)))
-			return -XFS_ERROR(EFAULT);
-
-		/* input parameter is passed in resblks field of structure */
-		in = inout.resblks;
-		error = xfs_reserve_blocks(mp, &in, &inout);
-		if (error)
-			return -error;
-
-		if (copy_to_user(arg, &inout, sizeof(inout)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_GET_RESBLKS: {
-		xfs_fsop_resblks_t out;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		error = xfs_reserve_blocks(mp, NULL, &out);
-		if (error)
-			return -error;
-
-		if (copy_to_user(arg, &out, sizeof(out)))
-			return -XFS_ERROR(EFAULT);
-
-		return 0;
-	}
-
-	case XFS_IOC_FSGROWFSDATA: {
-		xfs_growfs_data_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_growfs_data(mp, &in);
-		return -error;
-	}
-
-	case XFS_IOC_FSGROWFSLOG: {
-		xfs_growfs_log_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_growfs_log(mp, &in);
-		return -error;
-	}
-
-	case XFS_IOC_FSGROWFSRT: {
-		xfs_growfs_rt_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_growfs_rt(mp, &in);
-		return -error;
-	}
-
-	case XFS_IOC_FREEZE:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (inode->i_sb->s_frozen == SB_UNFROZEN)
-			freeze_bdev(inode->i_sb->s_bdev);
-		return 0;
-
-	case XFS_IOC_THAW:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-		if (inode->i_sb->s_frozen != SB_UNFROZEN)
-			thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
-		return 0;
-
-	case XFS_IOC_GOINGDOWN: {
-		__uint32_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (get_user(in, (__uint32_t __user *)arg))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_fs_goingdown(mp, in);
-		return -error;
-	}
-
-	case XFS_IOC_ERROR_INJECTION: {
-		xfs_error_injection_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_errortag_add(in.errtag, mp);
-		return -error;
-	}
-
-	case XFS_IOC_ERROR_CLEARALL:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		error = xfs_errortag_clearall(mp, 1);
-		return -error;
-
-	default:
-		return -ENOTTY;
-	}
-}
-
 STATIC int
 xfs_ioc_space(
 	struct xfs_inode	*ip,
@@ -1398,12 +1090,10 @@ xfs_ioctl(
 		return xfs_ioc_fsgetxattr(ip, 0, arg);
 	case XFS_IOC_FSGETXATTRA:
 		return xfs_ioc_fsgetxattr(ip, 1, arg);
-	case XFS_IOC_FSSETXATTR:
-		return xfs_ioc_fssetxattr(ip, filp, arg);
 	case XFS_IOC_GETXFLAGS:
-		return xfs_ioc_getxflags(ip, arg);
 	case XFS_IOC_SETXFLAGS:
-		return xfs_ioc_setxflags(ip, filp, arg);
+	case XFS_IOC_FSSETXATTR:
+		return xfs_ioc_xattr(ip, filp, cmd, arg);
 
 	case XFS_IOC_FSSETDM: {
 		struct fsdmidata	dmi;
-- 
cgit v0.10.2


From 65e67f5165c8a156b34ee7adf65d5ed3b16a910d Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>
Date: Fri, 18 Apr 2008 12:59:45 +1000
Subject: [XFS] Fix merge failure


diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 640c8b6..bf77597 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1090,10 +1090,12 @@ xfs_ioctl(
 		return xfs_ioc_fsgetxattr(ip, 0, arg);
 	case XFS_IOC_FSGETXATTRA:
 		return xfs_ioc_fsgetxattr(ip, 1, arg);
+	case XFS_IOC_FSSETXATTR:
+		return xfs_ioc_fssetxattr(ip, filp, arg);
 	case XFS_IOC_GETXFLAGS:
+		return xfs_ioc_getxflags(ip, arg);
 	case XFS_IOC_SETXFLAGS:
-	case XFS_IOC_FSSETXATTR:
-		return xfs_ioc_xattr(ip, filp, cmd, arg);
+		return xfs_ioc_setxflags(ip, filp, arg);
 
 	case XFS_IOC_FSSETDM: {
 		struct fsdmidata	dmi;
-- 
cgit v0.10.2