From 8b712cd58adfe6aeeb0be4ecc011dc35620719e7 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Tue, 7 Jul 2009 17:22:12 -0400
Subject: ocfs2: Fixup orphan scan cleanup after failed mount

If the mount fails for any reason, ocfs2_dismount_volume calls
ocfs2_orphan_scan_stop. It requires that ocfs2_orphan_scan_init
be called to setup the mutex and work queues, but that doesn't
happen if the mount has failed and we oops accessing an uninitialized
work queue.

This patch splits the init and startup of the orphan scan, eliminating
the oops.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f033760..c48b93a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1954,10 +1954,16 @@ void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
 	os->os_osb = osb;
 	os->os_count = 0;
 	os->os_seqno = 0;
-	os->os_scantime = CURRENT_TIME;
 	mutex_init(&os->os_lock);
 	INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work);
+}
 
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os;
+
+	os = &osb->osb_orphan_scan;
+	os->os_scantime = CURRENT_TIME;
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
 		atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
 	else {
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 5432c7f..81e8abc 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -145,6 +145,7 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
 
 /* Exported only for the journal struct init code in super.c. Do not call. */
 void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
 void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
 void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7efb349..63af2e3 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1182,7 +1182,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	wake_up(&osb->osb_mount_event);
 
 	/* Start this when the mount is almost sure of being successful */
-	ocfs2_orphan_scan_init(osb);
+	ocfs2_orphan_scan_start(osb);
 
 	mlog_exit(status);
 	return status;
@@ -1981,6 +1981,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
 		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
+	ocfs2_orphan_scan_init(osb);
+
 	status = ocfs2_recovery_init(osb);
 	if (status) {
 		mlog(ML_ERROR, "Unable to initialize recovery state\n");
-- 
cgit v0.10.2


From 17ae26b669886efe237b77439e43eb390fceb119 Mon Sep 17 00:00:00 2001
From: Jeff Liu <jeff.liu@oracle.com>
Date: Tue, 7 Jul 2009 15:51:40 +0800
Subject: ocfs2: trivial fix for s/migrate/migration/ in dlmrecovery.c logging

in dlmrecovery.c:1121, replace 'migrate' to 'migration' to keep the consistency
by comparing to other lines with the similar log info in the same file.

Signed-off-by: Jeff Liu <jeff.liu@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index bcb9260..43e6e32 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
 
 	mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
 	     dlm->name, res->lockname.len, res->lockname.name,
-	     orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery",
+	     orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
 	     send_to);
 
 	/* send it */
-- 
cgit v0.10.2


From 812e7a6a43fc34bc8f70c2b80db4ea5997d66ea8 Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Fri, 10 Jul 2009 13:26:04 +0800
Subject: ocfs2: log the actual return value of ocfs2_file_aio_write()

in ocfs2_file_aio_write(), log_exit() could don't log the value
which is really returned. this patch fixes it.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 62442e4..a49fa44 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1918,8 +1918,10 @@ out_sems:
 
 	mutex_unlock(&inode->i_mutex);
 
+	if (written)
+		ret = written;
 	mlog_exit(ret);
-	return written ? written : ret;
+	return ret;
 }
 
 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
-- 
cgit v0.10.2


From cefcb800fa9536bb6f29485c53e6c82a65b0c022 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Date: Sat, 11 Jul 2009 10:57:27 -0500
Subject: ocfs2: Initialize count in aio_write before generic_write_checks

generic_write_checks() expects count to be initialized to the size of
the write.  Writes to files open with O_DIRECT|O_LARGEFILE write 0 bytes
because count is uninitialized.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.de>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a49fa44..aa501d3 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1851,6 +1851,7 @@ relock:
 		if (ret)
 			goto out_dio;
 
+		count = ocount;
 		ret = generic_write_checks(file, ppos, &count,
 					   S_ISBLK(inode->i_mode));
 		if (ret)
-- 
cgit v0.10.2


From 44d8e4e1e9b941065eb7578669fe51eb65c73e63 Mon Sep 17 00:00:00 2001
From: Subrata Modak <subrata@linux.vnet.ibm.com>
Date: Tue, 14 Jul 2009 01:19:31 +0530
Subject: ocfs2: Fix compilation warning for fs/ocfs2/xattr.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gcc 4.4.1 generates the following build warning on i386:

CC [M]  fs/ocfs2/xattr.o
fs/ocfs2/xattr.c: In function ‘ocfs2_xattr_block_get’:
fs/ocfs2/xattr.c:1055: warning: ‘block_off’ may be used uninitialized in this function

The following fix is based on a similar approach by David Howells
few days back: http://lkml.org/lkml/2009/7/9/109,

Signed-off-by: Subrata Modak<subrata@linux.vnet.ibm.com>,
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ba320e2..d1a27cd 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	struct ocfs2_xattr_block *xb;
 	struct ocfs2_xattr_value_root *xv;
 	size_t size;
-	int ret = -ENODATA, name_offset, name_len, block_off, i;
+	int ret = -ENODATA, name_offset, name_len, i;
+	int uninitialized_var(block_off);
 
 	xs->bucket = ocfs2_xattr_bucket_new(inode);
 	if (!xs->bucket) {
-- 
cgit v0.10.2


From cbfa9639aea5007313b75ec43b689d5f9e0c037d Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Mon, 13 Jul 2009 11:38:23 +0800
Subject: ocfs2: Fix error return in ocfs2_write_cluster()

A typo caused ocfs2_write_cluster() to return 0 in some error cases.
Fix it.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b2c52b3..25aced3 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1301,7 +1301,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		if (tmpret) {
 			mlog_errno(tmpret);
 			if (ret == 0)
-				tmpret = ret;
+				ret = tmpret;
 		}
 	}
 
-- 
cgit v0.10.2


From 1f4cea3790bf44137192f441ee84af256e3bf809 Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Mon, 13 Jul 2009 11:38:58 +0800
Subject: ocfs2: Fail ocfs2_get_block() immediately when a block needs
 allocation

ocfs2_get_block() does no allocation.  Hole filling for writes should
have happened farther up in the call chain.  We detect this case and
print an error, but we then continue with the function.  We should be
exiting immediately.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 25aced3..e511df1 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 			mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
 			dump_stack();
+			goto bail;
 		}
 
 		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-- 
cgit v0.10.2


From 3c5e10683e684ef45614c9071847e48f633d9806 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Tue, 21 Jul 2009 15:42:05 +0800
Subject: ocfs2: Add extra credits and access the modified bh in
 update_edge_lengths.

In normal tree rotation left process, we will never touch the tree
branch above subtree_index and ocfs2_extend_rotate_transaction doesn't
reserve the credits for them either.

But when we want to delete the rightmost extent block, we have to update
the rightmost records for all the rightmost branch(See
ocfs2_update_edge_lengths), so we have to allocate extra credits for them.
What's more, we have to access them also.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9edcde4..11085af 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2476,15 +2476,37 @@ out_ret_path:
 	return ret;
 }
 
-static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
-				      struct ocfs2_path *path)
+static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+				     int subtree_index, struct ocfs2_path *path)
 {
-	int i, idx;
+	int i, idx, ret;
 	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_block *eb;
 	u32 range;
 
+	/*
+	 * In normal tree rotation process, we will never touch the
+	 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
+	 * doesn't reserve the credits for them either.
+	 *
+	 * But we do have a special case here which will update the rightmost
+	 * records for all the bh in the path.
+	 * So we have to allocate extra credits and access them.
+	 */
+	ret = ocfs2_extend_trans(handle,
+				 handle->h_buffer_credits + subtree_index);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_path(inode, handle, path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	/* Path should always be rightmost. */
 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
 	BUG_ON(eb->h_next_leaf_blk != 0ULL);
@@ -2505,6 +2527,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
 
 		ocfs2_journal_dirty(handle, path->p_node[i].bh);
 	}
+out:
+	return ret;
 }
 
 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
@@ -2717,7 +2741,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 	if (del_right_subtree) {
 		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
 				     subtree_index, dealloc);
-		ocfs2_update_edge_lengths(inode, handle, left_path);
+		ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+						left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -3034,7 +3063,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 
 		ocfs2_unlink_subtree(inode, handle, left_path, path,
 				     subtree_index, dealloc);
-		ocfs2_update_edge_lengths(inode, handle, left_path);
+		ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+						left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
-- 
cgit v0.10.2


From f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 20 Jul 2009 12:12:36 +0200
Subject: ocfs2: Fix deadlock on umount

In commit ea455f8ab68338ba69f5d3362b342c115bea8e13, we moved the dentry lock
put process into ocfs2_wq. This causes problems during umount because ocfs2_wq
can drop references to inodes while they are being invalidated by
invalidate_inodes() causing all sorts of nasty things (invalidate_inodes()
ending in an infinite loop, "Busy inodes after umount" messages etc.).

We fix the problem by stopping ocfs2_wq from doing any further releasing of
inode references on the superblock being unmounted, wait until it finishes
the current round of releasing and finally cleaning up all the references in
dentry_lock_list from ocfs2_put_super().

The issue was tracked down by Tao Ma <tao.ma@oracle.com>.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b574431..2f28b7d 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -310,22 +310,19 @@ out_attach:
 	return ret;
 }
 
-static DEFINE_SPINLOCK(dentry_list_lock);
+DEFINE_SPINLOCK(dentry_list_lock);
 
 /* We limit the number of dentry locks to drop in one go. We have
  * this limit so that we don't starve other users of ocfs2_wq. */
 #define DL_INODE_DROP_COUNT 64
 
 /* Drop inode references from dentry locks */
-void ocfs2_drop_dl_inodes(struct work_struct *work)
+static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
 {
-	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
-					       dentry_lock_work);
 	struct ocfs2_dentry_lock *dl;
-	int drop_count = DL_INODE_DROP_COUNT;
 
 	spin_lock(&dentry_list_lock);
-	while (osb->dentry_lock_list && drop_count--) {
+	while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
 		dl = osb->dentry_lock_list;
 		osb->dentry_lock_list = dl->dl_next;
 		spin_unlock(&dentry_list_lock);
@@ -333,11 +330,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work)
 		kfree(dl);
 		spin_lock(&dentry_list_lock);
 	}
-	if (osb->dentry_lock_list)
+	spin_unlock(&dentry_list_lock);
+}
+
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+					       dentry_lock_work);
+
+	__ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
+	/*
+	 * Don't queue dropping if umount is in progress. We flush the
+	 * list in ocfs2_dismount_volume
+	 */
+	spin_lock(&dentry_list_lock);
+	if (osb->dentry_lock_list &&
+	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
 		queue_work(ocfs2_wq, &osb->dentry_lock_work);
 	spin_unlock(&dentry_list_lock);
 }
 
+/* Flush the whole work queue */
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
+{
+	__ocfs2_drop_dl_inodes(osb, -1);
+}
+
 /*
  * ocfs2_dentry_iput() and friends.
  *
@@ -368,7 +386,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
 	/* We leave dropping of inode reference to ocfs2_wq as that can
 	 * possibly lead to inode deletion which gets tricky */
 	spin_lock(&dentry_list_lock);
-	if (!osb->dentry_lock_list)
+	if (!osb->dentry_lock_list &&
+	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
 		queue_work(ocfs2_wq, &osb->dentry_lock_work);
 	dl->dl_next = osb->dentry_lock_list;
 	osb->dentry_lock_list = dl;
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index faa12e7..f5dd178 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -49,10 +49,13 @@ struct ocfs2_dentry_lock {
 int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
 			     u64 parent_blkno);
 
+extern spinlock_t dentry_list_lock;
+
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl);
 
 void ocfs2_drop_dl_inodes(struct work_struct *work);
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
 
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
 				      int skip_unhashed);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c9345eb..39e1d5a 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -224,10 +224,12 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
 };
 
-#define OCFS2_OSB_SOFT_RO	0x0001
-#define OCFS2_OSB_HARD_RO	0x0002
-#define OCFS2_OSB_ERROR_FS	0x0004
-#define OCFS2_DEFAULT_ATIME_QUANTUM	60
+#define OCFS2_OSB_SOFT_RO			0x0001
+#define OCFS2_OSB_HARD_RO			0x0002
+#define OCFS2_OSB_ERROR_FS			0x0004
+#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED	0x0008
+
+#define OCFS2_DEFAULT_ATIME_QUANTUM		60
 
 struct ocfs2_journal;
 struct ocfs2_slot_info;
@@ -490,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
 	spin_unlock(&osb->osb_lock);
 }
 
+
+static inline unsigned long  ocfs2_test_osb_flag(struct ocfs2_super *osb,
+						 unsigned long flag)
+{
+	unsigned long ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & flag;
+	spin_unlock(&osb->osb_lock);
+	return ret;
+}
+
 static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
 				     int hard)
 {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 63af2e3..f289387 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1213,14 +1213,27 @@ static int ocfs2_get_sb(struct file_system_type *fs_type,
 			   mnt);
 }
 
+static void ocfs2_kill_sb(struct super_block *sb)
+{
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	/* Prevent further queueing of inode drop events */
+	spin_lock(&dentry_list_lock);
+	ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
+	spin_unlock(&dentry_list_lock);
+	/* Wait for work to finish and/or remove it */
+	cancel_work_sync(&osb->dentry_lock_work);
+
+	kill_block_super(sb);
+}
+
 static struct file_system_type ocfs2_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "ocfs2",
 	.get_sb         = ocfs2_get_sb, /* is this called when we mount
 					* the fs? */
-	.kill_sb        = kill_block_super, /* set to the generic one
-					     * right now, but do we
-					     * need to change that? */
+	.kill_sb        = ocfs2_kill_sb,
+
 	.fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
 	.next           = NULL
 };
@@ -1819,6 +1832,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	debugfs_remove(osb->osb_ctxt);
 
+	/*
+	 * Flush inode dropping work queue so that deletes are
+	 * performed while the filesystem is still working
+	 */
+	ocfs2_drop_all_dl_inodes(osb);
+
 	/* Orphan scan should be stopped as early as possible */
 	ocfs2_orphan_scan_stop(osb);
 
-- 
cgit v0.10.2


From 82e12644cf5227dab15201fbcaf0ca6330ebd70f Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 23 Jul 2009 08:12:58 +0800
Subject: ocfs2: Use ocfs2_rec_clusters in ocfs2_adjust_adjacent_records.

In ocfs2_adjust_adjacent_records, we will adjust adjacent records
according to the extent_list in the lower level. But actually
the lower level tree will either be a leaf or a branch. If we only
use ocfs2_is_empty_extent we will meet with some problem if the lower
tree is a branch (tree_depth > 1). So use !ocfs2_rec_clusters instead.
And actually only the leaf record can have holes. So add a BUG_ON
for non-leaf branch.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 11085af..f9a3e89 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1914,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
 	 * immediately to their right.
 	 */
 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
-	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
+		BUG_ON(right_child_el->l_tree_depth);
 		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
 		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
 	}
-- 
cgit v0.10.2


From b57ac2c43e66cb4aa76c9d2d0e9e0e04f19cc5a6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:15 +0200
Subject: ocfs2: Make global quota files blocksize aligned

Change i_size of global quota files so that it always remains aligned to block
size. This is mainly because the end of quota block may contain checksum (if
checksumming is enabled) and it's a bit awkward for it to be "outside" of quota
file (and it makes life harder for ocfs2-tools).

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index edfa60c..a66cb82 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -211,14 +211,17 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 
 	mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
 	if (gqinode->i_size < off + len) {
+		loff_t rounded_end =
+				ocfs2_align_bytes_to_blocks(sb, off + len);
+
 		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
-		err = ocfs2_extend_no_holes(gqinode, off + len, off);
+		err = ocfs2_extend_no_holes(gqinode, rounded_end, off);
 		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
 		if (err < 0)
 			goto out;
 		err = ocfs2_simple_size_update(gqinode,
 					       oinfo->dqi_gqi_bh,
-					       off + len);
+					       rounded_end);
 		if (err < 0)
 			goto out;
 		new = 1;
-- 
cgit v0.10.2


From 4b3fa1904c0d192461ebba692e4940a334b74353 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:16 +0200
Subject: ocfs2: Mark buffer uptodate before calling ocfs2_journal_access_dq()

In a code path extending local quota files we marked new header
buffer uptodate only after calling ocfs2_journal_access_dq() which
triggers a bug. Fix it and also call ocfs2 variant of the function
marking buffer uptodate.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 5a460fa..b624f9b 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -20,6 +20,7 @@
 #include "sysfile.h"
 #include "dlmglue.h"
 #include "quota.h"
+#include "uptodate.h"
 
 /* Number of local quota structures per block */
 static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -979,6 +980,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		mlog_errno(status);
 		goto out;
 	}
+	ocfs2_set_new_buffer_uptodate(lqinode, bh);
+
 	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
 
 	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
@@ -999,7 +1002,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	memset(dchunk->dqc_bitmap, 0,
 	       sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
 	       OCFS2_QBLK_RESERVED_SPACE);
-	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 	status = ocfs2_journal_dirty(handle, bh);
 	if (status < 0) {
-- 
cgit v0.10.2


From 0e7f387bf386c99e8ee322649fe4fc23b9cccc6c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:17 +0200
Subject: ocfs2: Initialize blocks allocated to local quota file

When we extend local quota file, we should initialize data
in newly allocated block. Firstly because on recovery we could
parse bogus data, secondly so that block checksums are properly
computed.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index b624f9b..9d2993d 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -941,7 +941,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	struct ocfs2_local_disk_chunk *dchunk;
 	int status;
 	handle_t *handle;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh = NULL, *dbh = NULL;
 	u64 p_blkno;
 
 	/* We are protected by dqio_sem so no locking needed */
@@ -965,34 +965,32 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		mlog_errno(status);
 		goto out;
 	}
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 3);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
 
+	/* Initialize chunk header */
 	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
 					     &p_blkno, NULL, NULL);
 	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	if (status < 0) {
 		mlog_errno(status);
-		goto out;
+		goto out_trans;
 	}
 	bh = sb_getblk(sb, p_blkno);
 	if (!bh) {
 		status = -ENOMEM;
 		mlog_errno(status);
-		goto out;
+		goto out_trans;
 	}
-	ocfs2_set_new_buffer_uptodate(lqinode, bh);
-
 	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
-
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto out;
-	}
-
+	ocfs2_set_new_buffer_uptodate(lqinode, bh);
 	status = ocfs2_journal_access_dq(handle, lqinode, bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_trans;
@@ -1009,6 +1007,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		goto out_trans;
 	}
 
+	/* Initialize new block with structures */
+	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
+					     &p_blkno, NULL, NULL);
+	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	dbh = sb_getblk(sb, p_blkno);
+	if (!dbh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out_trans;
+	}
+	ocfs2_set_new_buffer_uptodate(lqinode, dbh);
+	status = ocfs2_journal_access_dq(handle, lqinode, dbh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(dbh);
+	memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
+	unlock_buffer(dbh);
+	status = ocfs2_journal_dirty(handle, dbh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	/* Update local quotafile info */
 	oinfo->dqi_blocks += 2;
 	oinfo->dqi_chunks++;
 	status = ocfs2_local_write_info(sb, type);
@@ -1033,6 +1063,7 @@ out_trans:
 	ocfs2_commit_trans(OCFS2_SB(sb), handle);
 out:
 	brelse(bh);
+	brelse(dbh);
 	kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
 	return ERR_PTR(status);
 }
@@ -1050,6 +1081,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 	struct ocfs2_local_disk_chunk *dchunk;
 	int epb = ol_quota_entries_per_block(sb);
 	unsigned int chunk_blocks;
+	struct buffer_head *bh;
+	u64 p_blkno;
 	int status;
 	handle_t *handle;
 
@@ -1077,12 +1110,46 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 		mlog_errno(status);
 		goto out;
 	}
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+
+	/* Get buffer from the just added block */
+	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+					     &p_blkno, NULL, NULL);
+	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	bh = sb_getblk(sb, p_blkno);
+	if (!bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(lqinode, bh);
+
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 3);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
 		goto out;
 	}
+	/* Zero created block */
+	status = ocfs2_journal_access_dq(handle, lqinode, bh,
+				 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(bh);
+	memset(bh->b_data, 0, sb->s_blocksize);
+	unlock_buffer(bh);
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	/* Update chunk header */
 	status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
 				 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
@@ -1099,6 +1166,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 		mlog_errno(status);
 		goto out_trans;
 	}
+	/* Update file header */
 	oinfo->dqi_blocks++;
 	status = ocfs2_local_write_info(sb, type);
 	if (status < 0) {
-- 
cgit v0.10.2


From 7669f54c55df225cbb2db939a6c9f111445ffc1c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:18 +0200
Subject: ocfs2: Zero out padding of on disk dquot structure

Padding fields of on-disk dquot structure were not zeroed. Zero them
so that it's easier to use them later.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index a66cb82..7248db6 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -69,6 +69,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
 	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
 	d->dqb_btime = cpu_to_le64(m->dqb_btime);
 	d->dqb_itime = cpu_to_le64(m->dqb_itime);
+	d->dqb_pad1 = d->dqb_pad2 = 0;
 }
 
 static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
-- 
cgit v0.10.2


From 1c1d9793ff6720531c0125a28d321f283716e32f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:19 +0200
Subject: ocfs2: Fix initialization of blockcheck stats

We just set blockcheck stats to zeros but we should also
properly initialize the spinlock there.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f289387..b0ee0fd 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -777,6 +777,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
 		}
 		di = (struct ocfs2_dinode *) (*bh)->b_data;
 		memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
+		spin_lock_init(&stats->b_lock);
 		status = ocfs2_verify_volume(di, *bh, blksize, stats);
 		if (status >= 0)
 			goto bail;
-- 
cgit v0.10.2


From 4539f1df25bcd0fdf0d8a5e2c92de6bece83c7a0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:20 +0200
Subject: ocfs2: Remove syncjiff field from quota info

syncjiff is just a converted value of syncms. Some places which
are updating syncms forgot to update syncjiff as well. Since the
conversion is just a simple division / multiplication and it does
not happen frequently, just remove the syncjiff field to avoid
forgotten conversions.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 7365e2e..3fb96fcd 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo {
 	unsigned int dqi_chunks;	/* Number of chunks in local quota file */
 	unsigned int dqi_blocks;	/* Number of blocks allocated for local quota file */
 	unsigned int dqi_syncms;	/* How often should we sync with other nodes */
-	unsigned int dqi_syncjiff;	/* Precomputed dqi_syncms in jiffies */
 	struct list_head dqi_chunk;	/* List of chunks */
 	struct inode *dqi_gqinode;	/* Global quota file inode */
 	struct ocfs2_lock_res dqi_gqlock;	/* Lock protecting quota information structure */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 7248db6..dab4546 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -346,7 +346,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
-	oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
 	oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
 	oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
 	oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
@@ -356,7 +355,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
 	INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
 	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
-			   oinfo->dqi_syncjiff);
+			   msecs_to_jiffies(oinfo->dqi_syncms));
 
 out_err:
 	mlog_exit(status);
@@ -611,7 +610,7 @@ static void qsync_work_fn(struct work_struct *work)
 
 	dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
 	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
-			   oinfo->dqi_syncjiff);
+			   msecs_to_jiffies(oinfo->dqi_syncms));
 }
 
 /*
-- 
cgit v0.10.2


From 0584974a77796581eb3a64b6c5005edac4a95128 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:21 +0200
Subject: ocfs2: Define credit counts for quota operations

Numbers of needed credits for some quota operations were written
as raw numbers. Create appropriate defines instead.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 81e8abc..4a4d3b5 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -330,20 +330,27 @@ int                  ocfs2_journal_dirty(handle_t *handle,
 /* extended attribute block update */
 #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
 
+/* Update of a single quota block */
+#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1
+
 /* global quotafile inode update, data block */
-#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
+				   OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
 
+#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS
 /*
  * The two writes below can accidentally see global info dirty due
  * to set_info() quotactl so make them prepared for the writes.
  */
 /* quota data block, global info */
 /* Write to local quota file */
-#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+			      OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
 
 /* global quota data block, local quota data block, global quota inode,
  * global quota info */
-#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+			     2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
 
 static inline int ocfs2_quota_trans_credits(struct super_block *sb)
 {
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index dab4546..cc8785c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -648,10 +648,15 @@ int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
 		return 0;
 
 	oinfo = sb_dqinfo(sb, type)->dqi_priv;
-	/* We modify tree, leaf block, global info, local chunk header,
-	 * global and local inode */
-	return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
-	       2 * OCFS2_INODE_UPDATE_CREDITS;
+	/*
+	 * We modify tree, leaf block, global info, local chunk header,
+	 * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
+	 * accounts for inode update
+	 */
+	return oinfo->dqi_gi.dqi_qtree_depth +
+	       OCFS2_QINFO_WRITE_CREDITS +
+	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
+	       OCFS2_INODE_UPDATE_CREDITS;
 }
 
 static int ocfs2_release_dquot(struct dquot *dquot)
@@ -701,7 +706,10 @@ int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
 	 * global file we can modify info, tree and leaf block */
 	return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
 	       ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
-	       3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
+	       OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
+	       oinfo->dqi_gi.dqi_qtree_depth +
+	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
 }
 
 static int ocfs2_acquire_dquot(struct dquot *dquot)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 9d2993d..bdb09cb 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -101,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
 	handle_t *handle;
 	int status;
 
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+				   OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -611,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
 			goto out_bh;
 		/* Mark quota file as clean if we are recovering quota file of
 		 * some other node. */
-		handle = ocfs2_start_trans(osb, 1);
+		handle = ocfs2_start_trans(osb,
+					   OCFS2_LOCAL_QINFO_WRITE_CREDITS);
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
 			mlog_errno(status);
@@ -965,7 +967,10 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		mlog_errno(status);
 		goto out;
 	}
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 3);
+	/* Local quota info and two new blocks we initialize */
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+			OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+			2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -1128,7 +1133,10 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 	}
 	ocfs2_set_new_buffer_uptodate(lqinode, bh);
 
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 3);
+	/* Local quota info, chunk header and the new block we initialize */
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+			OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+			2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
-- 
cgit v0.10.2


From 7194e6f9c083e87171ddfc8b746f05e007f58132 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <Adrian.Hunter@nokia.com>
Date: Fri, 24 Jul 2009 17:05:00 +0300
Subject: UBI: fix double free on error path

If we fail in 'ubi_eba_init_scan()', we free
'ubi->volumes[i]->eba_tbl' in there, but also later free it
in 'free_internal_volumes()'. Fix this by assigning NULL
to 'ubi->volumes[i]->eba_tbl' after it is freed.

Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>

diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c
index 0f2034c..e4d9ef0 100644
--- a/drivers/mtd/ubi/eba.c
+++ b/drivers/mtd/ubi/eba.c
@@ -1254,6 +1254,7 @@ out_free:
 		if (!ubi->volumes[i])
 			continue;
 		kfree(ubi->volumes[i]->eba_tbl);
+		ubi->volumes[i]->eba_tbl = NULL;
 	}
 	return err;
 }
-- 
cgit v0.10.2


From 32bc4820287a1a03982979515949e8ea56eac641 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <Adrian.Hunter@nokia.com>
Date: Fri, 24 Jul 2009 19:16:04 +0300
Subject: UBI: compatible fallback in absense of sequence numbers

Fall back onto thinking everything's OK if either of the sequence
numbers we are asked to compare is zero, which is what was used
before sequence numbers were introduced.

[ Artem: modified the patch to be applicable to upstream UBI, added
        big comment ]

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>

diff --git a/drivers/mtd/ubi/scan.c b/drivers/mtd/ubi/scan.c
index a423131..b847745 100644
--- a/drivers/mtd/ubi/scan.c
+++ b/drivers/mtd/ubi/scan.c
@@ -781,11 +781,22 @@ static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si,
 			return -EINVAL;
 		}
 
+		/*
+		 * Make sure that all PEBs have the same image sequence number.
+		 * This allows us to detect situations when users flash UBI
+		 * images incorrectly, so that the flash has the new UBI image
+		 * and leftovers from the old one. This feature was added
+		 * relatively recently, and the sequence number was always
+		 * zero, because old UBI implementations always set it to zero.
+		 * For this reasons, we do not panic if some PEBs have zero
+		 * sequence number, while other PEBs have non-zero sequence
+		 * number.
+		 */
 		image_seq = be32_to_cpu(ech->image_seq);
 		if (!si->image_seq_set) {
 			ubi->image_seq = image_seq;
 			si->image_seq_set = 1;
-		} else if (ubi->image_seq != image_seq) {
+		} else if (ubi->image_seq && ubi->image_seq != image_seq) {
 			ubi_err("bad image sequence number %d in PEB %d, "
 				"expected %d", image_seq, pnum, ubi->image_seq);
 			ubi_dbg_dump_ec_hdr(ech);
-- 
cgit v0.10.2


From e9956fae7dbce6ac770e7a00436c496ddbbd3215 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 30 Jul 2009 16:07:10 +0800
Subject: ocfs2/quota: Release lock for error in ocfs2_quota_write.

ocfs2_quota_write needs to release the lock if it fails to
read quota block. So use "goto out" instead of "return err".

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index cc8785c..d604a6a 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -238,7 +238,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 	}
 	if (err) {
 		mlog_errno(err);
-		return err;
+		goto out;
 	}
 	lock_buffer(bh);
 	if (new)
-- 
cgit v0.10.2


From ab57a40827d99e2d8e59066a56b93bf6c844c916 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Date: Fri, 31 Jul 2009 14:28:02 -0500
Subject: ocfs2: Remove redundant BUG_ON in __dlm_queue_ast()

We BUG_ON() the same thing twice.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.de>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index d07ddbe..81eff8e 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 		     lock->ast_pending, lock->ml.type);
 		BUG();
 	}
-	BUG_ON(!list_empty(&lock->ast_list));
 	if (lock->ast_pending)
 		mlog(0, "lock has an ast getting flushed right now\n");
 
-- 
cgit v0.10.2


From 7d5b005652bc5ae3e1e0efc53fd0e25a643ec506 Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Tue, 4 Aug 2009 15:34:22 -0700
Subject: x86: Fix VMI && stack protector

With CONFIG_STACK_PROTECTOR turned on, VMI doesn't boot with
more than one processor. The problem is with the gs value not
being initialized correctly when registering the secondary
processor for VMI's case.

The patch below initializes the gs value for the AP to
__KERNEL_STACK_CANARY. Without this the secondary processor
keeps on taking a GP on every gs access.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Cc: <stable@kernel.org> # for v2.6.30.x
LKML-Reference: <1249425262.18955.40.camel@ank32.eng.vmware.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b263423..95a7289 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
 	ap.ds = __USER_DS;
 	ap.es = __USER_DS;
 	ap.fs = __KERNEL_PERCPU;
-	ap.gs = 0;
+	ap.gs = __KERNEL_STACK_CANARY;
 
 	ap.eflags = 0;
 
-- 
cgit v0.10.2


From e125e7b6944898831b56739a5448e705578bf7e2 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Thu, 2 Jul 2009 21:45:47 +0200
Subject: KVM: Fix KVM_GET_MSR_INDEX_LIST

So far, KVM copied the emulated_msrs (only MSR_IA32_MISC_ENABLE) to a
wrong address in user space due to broken pointer arithmetic. This
caused subtle corruption up there (missing MSR_IA32_MISC_ENABLE had
probably no practical relevance). Moreover, the size check for the
user-provided kvm_msr_list forgot about emulated MSRs.

Cc: stable@kernel.org
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fe5474a..7bc3114 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1079,14 +1079,13 @@ long kvm_arch_dev_ioctl(struct file *filp,
 		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
 			goto out;
 		r = -E2BIG;
-		if (n < num_msrs_to_save)
+		if (n < msr_list.nmsrs)
 			goto out;
 		r = -EFAULT;
 		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
 				 num_msrs_to_save * sizeof(u32)))
 			goto out;
-		if (copy_to_user(user_msr_list->indices
-				 + num_msrs_to_save * sizeof(u32),
+		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
 				 &emulated_msrs,
 				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
 			goto out;
-- 
cgit v0.10.2


From 0ff77873b1318fc2d77a85e70690d3cd6cafbd41 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 2 Jul 2009 20:02:15 -0300
Subject: KVM: PIT: fix kpit_elapsed division by zero

Fix division by zero triggered by latch count command on uninitialized
counter.

Cc: stable@kernel.org
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4d6f0d2..21f68e0 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -104,6 +104,9 @@ static s64 __kpit_elapsed(struct kvm *kvm)
 	ktime_t remaining;
 	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
 
+	if (!ps->pit_timer.period)
+		return 0;
+
 	/*
 	 * The Counter does not stop when it reaches zero. In
 	 * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
-- 
cgit v0.10.2


From d6289b9365c3f622a8cfe62c4fb054bb70b5061a Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 22 Jun 2009 15:27:56 -0300
Subject: KVM: x86: verify MTRR/PAT validity

Do not allow invalid memory types in MTRR/PAT (generating a #GP
otherwise).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7bc3114..3d45290 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -704,11 +704,48 @@ static bool msr_mtrr_valid(unsigned msr)
 	return false;
 }
 
+static bool valid_pat_type(unsigned t)
+{
+	return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+	return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+	int i;
+
+	if (!msr_mtrr_valid(msr))
+		return false;
+
+	if (msr == MSR_IA32_CR_PAT) {
+		for (i = 0; i < 8; i++)
+			if (!valid_pat_type((data >> (i * 8)) & 0xff))
+				return false;
+		return true;
+	} else if (msr == MSR_MTRRdefType) {
+		if (data & ~0xcff)
+			return false;
+		return valid_mtrr_type(data & 0xff);
+	} else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+		for (i = 0; i < 8 ; i++)
+			if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+				return false;
+		return true;
+	}
+
+	/* variable MTRRs */
+	return valid_mtrr_type(data & 0xff);
+}
+
 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 
-	if (!msr_mtrr_valid(msr))
+	if (!mtrr_valid(vcpu, msr, data))
 		return 1;
 
 	if (msr == MSR_MTRRdefType) {
-- 
cgit v0.10.2


From 4b656b1202498184a0ecef86b3b89ff613b9c6ab Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 21 Jul 2009 12:47:45 -0300
Subject: KVM: SVM: force new asid on vcpu migration

If a migrated vcpu matches the asid_generation value of the target pcpu,
there will be no TLB flush via TLB_CONTROL_FLUSH_ALL_ASID.

The check for vcpu.cpu in pre_svm_run is meaningless since svm_vcpu_load
already updated it on schedule in.

Such vcpu will VMRUN with stale TLB entries.

Based on original patch from Joerg Roedel (http://patchwork.kernel.org/patch/10021/)

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 71510e0..b1f658a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -711,6 +711,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		svm->vmcb->control.tsc_offset += delta;
 		vcpu->cpu = cpu;
 		kvm_migrate_timers(vcpu);
+		svm->asid_generation = 0;
 	}
 
 	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
@@ -1031,7 +1032,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
 		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
 	}
 
-	svm->vcpu.cpu = svm_data->cpu;
 	svm->asid_generation = svm_data->asid_generation;
 	svm->vmcb->control.asid = svm_data->next_asid++;
 }
@@ -2300,8 +2300,8 @@ static void pre_svm_run(struct vcpu_svm *svm)
 	struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
 
 	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
-	if (svm->vcpu.cpu != cpu ||
-	    svm->asid_generation != svm_data->asid_generation)
+	/* FIXME: handle wraparound of asid_generation */
+	if (svm->asid_generation != svm_data->asid_generation)
 		new_asid(svm, svm_data);
 }
 
-- 
cgit v0.10.2


From 025dbbf36a7680bffe54d9dcbf0a8bc01a7cbd10 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 22 Jul 2009 13:05:49 -0300
Subject: KVM: MMU: handle n_free_mmu_pages > n_alloc_mmu_pages in
 kvm_mmu_change_mmu_pages

kvm_mmu_change_mmu_pages mishandles the case where n_alloc_mmu_pages is
smaller then n_free_mmu_pages, by not checking if the result of
the subtraction is negative.

Its a valid condition which can happen if a large number of pages has
been recently freed.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7030b5f..49a10d0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1407,24 +1407,25 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
  */
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 {
+	int used_pages;
+
+	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
+	used_pages = max(0, used_pages);
+
 	/*
 	 * If we set the number of mmu pages to be smaller be than the
 	 * number of actived pages , we must to free some mmu pages before we
 	 * change the value
 	 */
 
-	if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
-	    kvm_nr_mmu_pages) {
-		int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
-				       - kvm->arch.n_free_mmu_pages;
-
-		while (n_used_mmu_pages > kvm_nr_mmu_pages) {
+	if (used_pages > kvm_nr_mmu_pages) {
+		while (used_pages > kvm_nr_mmu_pages) {
 			struct kvm_mmu_page *page;
 
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
 			kvm_mmu_zap_page(kvm, page);
-			n_used_mmu_pages--;
+			used_pages--;
 		}
 		kvm->arch.n_free_mmu_pages = 0;
 	}
-- 
cgit v0.10.2


From 34f0c1ad27a74bd5eb0f99ea43ab6a4658d6419d Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Wed, 22 Jul 2009 23:53:26 +0200
Subject: KVM: VMX: Fix locking order in handle_invalid_guest_state

Release and re-acquire preemption and IRQ lock in the same order as
vcpu_enter_guest does.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 356a0ce..6bf58c0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3157,8 +3157,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	enum emulation_result err = EMULATE_DONE;
 
-	preempt_enable();
 	local_irq_enable();
+	preempt_enable();
 
 	while (!guest_state_valid(vcpu)) {
 		err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3177,8 +3177,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
 			schedule();
 	}
 
-	local_irq_disable();
 	preempt_disable();
+	local_irq_disable();
 
 	vmx->invalid_state_emulation_result = err;
 }
-- 
cgit v0.10.2


From 263799a3616242201e20fd2025fe84047b1379b1 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 21 Jul 2009 10:43:07 +0200
Subject: KVM: VMX: Fix locking imbalance on emulation failure

We have to disable preemption and IRQs on every exit from
handle_invalid_guest_state, otherwise we generate at least a
preempt_disable imbalance.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6bf58c0..29f9129 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3168,7 +3168,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
 
 		if (err != EMULATE_DONE) {
 			kvm_report_emulation_failure(vcpu, "emulation failure");
-			return;
+			break;
 		}
 
 		if (signal_pending(current))
-- 
cgit v0.10.2


From d3bc2f91b4761a8d9f96bea167fef2f8c00dea54 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 16 Jul 2009 17:17:37 +0200
Subject: KVM: s390: fix wait_queue handling

There are two waitqueues in kvm for wait handling:
vcpu->wq for virt/kvm/kvm_main.c and
vpcu->arch.local_int.wq for the s390 specific wait code.

the wait handling in kvm_s390_handle_wait was broken by using different
wait_queues for add_wait queue and remove_wait_queue.

There are two options to fix the problem:
o  move all the s390 specific code to vcpu->wq and remove
   vcpu->arch.local_int.wq
o  move all the s390 specific code to vcpu->arch.local_int.wq

This patch chooses the 2nd variant for two reasons:
o  s390 does not use kvm_vcpu_block but implements its own enabled wait
   handling.
   Having a separate wait_queue make it clear, that our wait mechanism is
   different
o  the patch is much smaller

Report-by:  Julia Lawall <julia@diku.dk>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f04f530..4d61341 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -386,7 +386,7 @@ no_timer:
 	}
 	__unset_cpu_idle(vcpu);
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&vcpu->wq, &wait);
+	remove_wait_queue(&vcpu->arch.local_int.wq, &wait);
 	spin_unlock_bh(&vcpu->arch.local_int.lock);
 	spin_unlock(&vcpu->arch.local_int.float_int->lock);
 	hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
-- 
cgit v0.10.2


From 5116d8f6b977970ebefc1932c0f313163a6ec91f Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 26 Jul 2009 17:10:01 +0300
Subject: KVM: fix ack not being delivered when msi present

kvm_notify_acked_irq does not check irq type, so that it sometimes
interprets msi vector as irq.  As a result, ack notifiers are not
called, which typially hangs the guest.  The fix is to track and
check irq type.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 16713dc..3060bdc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -110,6 +110,7 @@ struct kvm_memory_slot {
 
 struct kvm_kernel_irq_routing_entry {
 	u32 gsi;
+	u32 type;
 	int (*set)(struct kvm_kernel_irq_routing_entry *e,
 		    struct kvm *kvm, int level);
 	union {
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index a8bd466..ddc17f0 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -160,7 +160,8 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 	unsigned gsi = pin;
 
 	list_for_each_entry(e, &kvm->irq_routing, link)
-		if (e->irqchip.irqchip == irqchip &&
+		if (e->type == KVM_IRQ_ROUTING_IRQCHIP &&
+		    e->irqchip.irqchip == irqchip &&
 		    e->irqchip.pin == pin) {
 			gsi = e->gsi;
 			break;
@@ -259,6 +260,7 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
 	int delta;
 
 	e->gsi = ue->gsi;
+	e->type = ue->type;
 	switch (ue->type) {
 	case KVM_IRQ_ROUTING_IRQCHIP:
 		delta = 0;
-- 
cgit v0.10.2


From c428dcc9b9f967945992a2f8529e8c50a31d7913 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 17 Jun 2009 15:04:19 +1000
Subject: KVM: Make KVM_HPAGES_PER_HPAGE unsigned long to avoid build error on
 powerpc

Eliminates this compiler warning:

arch/powerpc/kvm/../../../virt/kvm/kvm_main.c:1178: error: integer overflow in expression

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index dfdf13c..fddc3ed 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -34,7 +34,7 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 
 /* We don't currently support large pages. */
-#define KVM_PAGES_PER_HPAGE (1<<31)
+#define KVM_PAGES_PER_HPAGE (1UL << 31)
 
 struct kvm;
 struct kvm_run;
-- 
cgit v0.10.2


From e9cbde8c158629cc96a26b2323c4a243536c1951 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 6 Jul 2009 12:49:39 +0300
Subject: KVM: ia64: fix build failures due to ia64/unsigned long mismatches

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c
index 21f63ff..9bf55af 100644
--- a/arch/ia64/kvm/mmio.c
+++ b/arch/ia64/kvm/mmio.c
@@ -247,7 +247,8 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma)
 		vcpu_get_fpreg(vcpu, inst.M9.f2, &v);
 		/* Write high word. FIXME: this is a kludge!  */
 		v.u.bits[1] &= 0x3ffff;
-		mmio_access(vcpu, padr + 8, &v.u.bits[1], 8, ma, IOREQ_WRITE);
+		mmio_access(vcpu, padr + 8, (u64 *)&v.u.bits[1], 8,
+			    ma, IOREQ_WRITE);
 		data = v.u.bits[0];
 		size = 3;
 	} else if (inst.M10.major == 7 && inst.M10.x6 == 0x3B) {
@@ -265,7 +266,8 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma)
 
 		/* Write high word.FIXME: this is a kludge!  */
 		v.u.bits[1] &= 0x3ffff;
-		mmio_access(vcpu, padr + 8, &v.u.bits[1], 8, ma, IOREQ_WRITE);
+		mmio_access(vcpu, padr + 8, (u64 *)&v.u.bits[1],
+			    8, ma, IOREQ_WRITE);
 		data = v.u.bits[0];
 		size = 3;
 	} else if (inst.M10.major == 7 && inst.M10.x6 == 0x31) {
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index 46b02cb..cc406d0 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -461,7 +461,7 @@ void setreg(unsigned long regnum, unsigned long val,
 u64 vcpu_get_gr(struct kvm_vcpu *vcpu, unsigned long reg)
 {
 	struct kvm_pt_regs *regs = vcpu_regs(vcpu);
-	u64 val;
+	unsigned long val;
 
 	if (!reg)
 		return 0;
@@ -469,7 +469,7 @@ u64 vcpu_get_gr(struct kvm_vcpu *vcpu, unsigned long reg)
 	return val;
 }
 
-void vcpu_set_gr(struct kvm_vcpu *vcpu, u64 reg, u64 value, int nat)
+void vcpu_set_gr(struct kvm_vcpu *vcpu, unsigned long reg, u64 value, int nat)
 {
 	struct kvm_pt_regs *regs = vcpu_regs(vcpu);
 	long sof = (regs->cr_ifs) & 0x7f;
@@ -1072,7 +1072,7 @@ void kvm_ttag(struct kvm_vcpu *vcpu, INST64 inst)
 	vcpu_set_gr(vcpu, inst.M46.r1, tag, 0);
 }
 
-int vcpu_tpa(struct kvm_vcpu *vcpu, u64 vadr, u64 *padr)
+int vcpu_tpa(struct kvm_vcpu *vcpu, u64 vadr, unsigned long *padr)
 {
 	struct thash_data *data;
 	union ia64_isr visr, pt_isr;
diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h
index 042af92..360724d 100644
--- a/arch/ia64/kvm/vcpu.h
+++ b/arch/ia64/kvm/vcpu.h
@@ -686,14 +686,15 @@ static inline int highest_inservice_irq(struct kvm_vcpu *vcpu)
 	return highest_bits((int *)&(VMX(vcpu, insvc[0])));
 }
 
-extern void vcpu_get_fpreg(struct kvm_vcpu *vcpu, u64 reg,
+extern void vcpu_get_fpreg(struct kvm_vcpu *vcpu, unsigned long reg,
 					struct ia64_fpreg *val);
-extern void vcpu_set_fpreg(struct kvm_vcpu *vcpu, u64 reg,
+extern void vcpu_set_fpreg(struct kvm_vcpu *vcpu, unsigned long reg,
 					struct ia64_fpreg *val);
-extern u64 vcpu_get_gr(struct kvm_vcpu *vcpu, u64 reg);
-extern void vcpu_set_gr(struct kvm_vcpu *vcpu, u64 reg, u64 val, int nat);
-extern u64 vcpu_get_psr(struct kvm_vcpu *vcpu);
-extern void vcpu_set_psr(struct kvm_vcpu *vcpu, u64 val);
+extern u64 vcpu_get_gr(struct kvm_vcpu *vcpu, unsigned long reg);
+extern void vcpu_set_gr(struct kvm_vcpu *vcpu, unsigned long reg,
+			u64 val, int nat);
+extern unsigned long vcpu_get_psr(struct kvm_vcpu *vcpu);
+extern void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val);
 extern u64 vcpu_thash(struct kvm_vcpu *vcpu, u64 vadr);
 extern void vcpu_bsw0(struct kvm_vcpu *vcpu);
 extern void thash_vhpt_insert(struct kvm_vcpu *v, u64 pte,
-- 
cgit v0.10.2


From 0f2541d299d233eddddee4345795e0c46264fd56 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 5 Aug 2009 12:02:48 -0400
Subject: ring-buffer: fix check of try_to_discard result

The function ring_buffer_discard_commit inversed the code path
of the result of try_to_discard. It should skip incrementing the
entry counter if try_to_discard succeeded. But instead, it increments
the entry conder if it succeeded to discard, and does not increment
it if it fails.

The result of this bug is that filtering will make the stat counters
incorrect.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bf27bb7..2fd1752 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1785,7 +1785,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	 */
 	RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
 
-	if (!rb_try_to_discard(cpu_buffer, event))
+	if (rb_try_to_discard(cpu_buffer, event))
 		goto out;
 
 	/*
-- 
cgit v0.10.2


From 464e85eb0e63096bd52e4c3e2a6fb8357fb95828 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 5 Aug 2009 15:26:37 -0400
Subject: ring-buffer: do not disable ring buffer on oops_in_progress

The commit:

  commit e0fdace10e75dac67d906213b780ff1b1a4cc360
  Author: David Miller <davem@davemloft.net>
  Date:   Fri Aug 1 01:11:22 2008 -0700

    debug_locks: set oops_in_progress if we will log messages.

    Otherwise lock debugging messages on runqueue locks can deadlock the
    system due to the wakeups performed by printk().

    Signed-off-by: David S. Miller <davem@davemloft.net>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

Will permanently set oops_in_progress on any lockdep failure.
When this triggers it will cause any read from the ring buffer to
permanently disable the ring buffer (not to mention no locking of
printk).

This patch removes the check. It keeps the print in NMI which makes
sense. This is probably OK, since the ring buffer should not cause
something to set oops_in_progress anyway.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2fd1752..2606cee 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2486,7 +2486,7 @@ static inline int rb_ok_to_lock(void)
 	 * buffer too. A one time deal is all you get from reading
 	 * the ring buffer from an NMI.
 	 */
-	if (likely(!in_nmi() && !oops_in_progress))
+	if (likely(!in_nmi()))
 		return 1;
 
 	tracing_off_permanent();
-- 
cgit v0.10.2


From 3f6e968ef4e1d8d93d8a8505461b0e50a9e97ad8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 5 Aug 2009 22:00:14 -0400
Subject: tracing: do not use functions starting with .L in recordmcount.pl

On Wed, 5 Aug 2009, Ingo Molnar wrote:
> * Dave Airlie <airlied@gmail.com> wrote:
>
> > Hey,
> >
> > So I spent 3-4 hrs today (I'm stupid yes) tracking down a .o
> > breakage by blaming rawhide gcc/binutils as I was using make
> > V=1and seeing only the compiler chain running,
>
> Hm, is this that powerpc related build bug you just reported?

Well we tracked it down and it is powerpc64 specific.

Seems that in drivers/hwmon/lm93.c there's a function called:

   LM93_IN_FROM_REG()

But PPC64 has function descriptors and the real function names (the ones
you see in objdump) start with a '.'. Thus this in objdump you have:

 Disassembly of section .text:

 0000000000000000 <.LM93_IN_FROM_REG>:
       0:       7c 08 02 a6     mflr    r0
       4:       fb 81 ff e0     std     r28,-32(r1)

The function name used is .LM93_IN_FROM_REG. But gcc considers symbols
that start with ".L" as a special symbol that is used inside the assembly
stage.

The nm passed into recordmcount uses the --synthetic option which shows
the ".L" symbols (my runs outside of the build did not include the
--synthetic option, so my older patch worked). We see the function as a
local.

Now to capture all the locations that use "mcount" we need to have a
reference to link into the object file a list of mcount callers. We need a
reference that will not disappear. We try to use a global function and if
that does not work, we use a local function as a reference. But to relink
the section back into the object, we need to make it global. In this case,
we run objcopy using --globalize-symbol and --localize-symbol to convert
the symbol into a global symbol, link the mcount list, then convert it
back to a local symbol.

This works great except for this case. .L* symbols can not be converted
into a global symbol, and the mcount section referencing it will remain
unresolved.

Reported-by: Dave Airlie <airlied@gmail.com>
LKML-Reference: <alpine.DEB.2.00.0908052011590.5010@gandalf.stny.rr.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index d29baa2..4889c44 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -414,7 +414,10 @@ while (<IN>) {
 	    $offset = hex $1;
 	} else {
 	    # if we already have a function, and this is weak, skip it
-	    if (!defined($ref_func) && !defined($weak{$text})) {
+	    if (!defined($ref_func) && !defined($weak{$text}) &&
+		 # PPC64 can have symbols that start with .L and
+		 # gcc considers these special. Don't use them!
+		 $text !~ /^\.L/) {
 		$ref_func = $text;
 		$offset = hex $1;
 	    }
-- 
cgit v0.10.2


From 1bbf20835c4e088667a090ce6523a0f70b62dc76 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Wed, 5 Aug 2009 12:05:21 -0700
Subject: rtmutex: Avoid deadlock in rt_mutex_start_proxy_lock()

In the event of a lock steal or owner died,
rt_mutex_start_proxy_lock() will give the rt_mutex to the
waiting task, but it fails to release the wait_lock. This leads
to subsequent deadlocks when other tasks try to acquire the
rt_mutex.

I also removed a few extra blank lines that really spaced this
routine out. I must have been high on the \n when I wrote this
originally...

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@linux.vnet.ibm.com>
LKML-Reference: <4A79D7F1.4000405@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index fcd107a..29bd4ba 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 	if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
 		/* We got the lock for task. */
 		debug_rt_mutex_lock(lock);
-
 		rt_mutex_set_owner(lock, task, 0);
-
+		spin_unlock(&lock->wait_lock);
 		rt_mutex_deadlock_account_lock(lock, task);
 		return 1;
 	}
 
 	ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
 
-
 	if (ret && !waiter->task) {
 		/*
 		 * Reset the return value. We might have
-- 
cgit v0.10.2


From 53a27b39ff4d2492f84b1fdc2f0047175f0b0b93 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 5 Aug 2009 15:43:58 -0300
Subject: KVM: MMU: limit rmap chain length

Otherwise the host can spend too long traversing an rmap chain, which
happens under a spinlock.

Cc: stable@kernel.org
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 49a10d0..0ef5bb2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -489,16 +489,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
  *
  * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
  * containing more mappings.
+ *
+ * Returns the number of rmap entries before the spte was added or zero if
+ * the spte was not added.
+ *
  */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
 {
 	struct kvm_mmu_page *sp;
 	struct kvm_rmap_desc *desc;
 	unsigned long *rmapp;
-	int i;
+	int i, count = 0;
 
 	if (!is_rmap_pte(*spte))
-		return;
+		return count;
 	gfn = unalias_gfn(vcpu->kvm, gfn);
 	sp = page_header(__pa(spte));
 	sp->gfns[spte - sp->spt] = gfn;
@@ -515,8 +519,10 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
 	} else {
 		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-		while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+		while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) {
 			desc = desc->more;
+			count += RMAP_EXT;
+		}
 		if (desc->shadow_ptes[RMAP_EXT-1]) {
 			desc->more = mmu_alloc_rmap_desc(vcpu);
 			desc = desc->more;
@@ -525,6 +531,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
 			;
 		desc->shadow_ptes[i] = spte;
 	}
+	return count;
 }
 
 static void rmap_desc_remove_entry(unsigned long *rmapp,
@@ -754,6 +761,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
 	return young;
 }
 
+#define RMAP_RECYCLE_THRESHOLD 1000
+
+static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage)
+{
+	unsigned long *rmapp;
+
+	gfn = unalias_gfn(vcpu->kvm, gfn);
+	rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
+
+	kvm_unmap_rmapp(vcpu->kvm, rmapp);
+	kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 {
 	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
@@ -1741,6 +1761,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 {
 	int was_rmapped = 0;
 	int was_writeble = is_writeble_pte(*shadow_pte);
+	int rmap_count;
 
 	pgprintk("%s: spte %llx access %x write_fault %d"
 		 " user_fault %d gfn %lx\n",
@@ -1782,9 +1803,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
 	if (!was_rmapped) {
-		rmap_add(vcpu, shadow_pte, gfn, largepage);
+		rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage);
 		if (!is_rmap_pte(*shadow_pte))
 			kvm_release_pfn_clean(pfn);
+		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+			rmap_recycle(vcpu, gfn, largepage);
 	} else {
 		if (was_writeble)
 			kvm_release_pfn_dirty(pfn);
-- 
cgit v0.10.2


From 469535a598f28c13a2a42037e1b778f671af1d16 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 30 Jul 2009 19:19:18 +0200
Subject: ring-buffer: Fix advance of reader in rb_buffer_peek()

When calling rb_buffer_peek() from ring_buffer_consume() and a
padding event is returned, the function rb_advance_reader() is
called twice. This may lead to missing samples or under high
workloads to the warning below. This patch fixes this. If a padding
event is returned by rb_buffer_peek() it will be consumed by the
calling function now.

Also, I simplified some code in ring_buffer_consume().

------------[ cut here ]------------
WARNING: at /dev/shm/.source/linux/kernel/trace/ring_buffer.c:2289 rb_advance_reader+0x2e/0xc5()
Hardware name: Anaheim
Modules linked in:
Pid: 29, comm: events/2 Tainted: G        W  2.6.31-rc3-oprofile-x86_64-standard-00059-g5050dc2 #1
Call Trace:
[<ffffffff8106776f>] ? rb_advance_reader+0x2e/0xc5
[<ffffffff81039ffe>] warn_slowpath_common+0x77/0x8f
[<ffffffff8103a025>] warn_slowpath_null+0xf/0x11
[<ffffffff8106776f>] rb_advance_reader+0x2e/0xc5
[<ffffffff81068bda>] ring_buffer_consume+0xa0/0xd2
[<ffffffff81326933>] op_cpu_buffer_read_entry+0x21/0x9e
[<ffffffff810be3af>] ? __find_get_block+0x4b/0x165
[<ffffffff8132749b>] sync_buffer+0xa5/0x401
[<ffffffff810be3af>] ? __find_get_block+0x4b/0x165
[<ffffffff81326c1b>] ? wq_sync_buffer+0x0/0x78
[<ffffffff81326c76>] wq_sync_buffer+0x5b/0x78
[<ffffffff8104aa30>] worker_thread+0x113/0x1ac
[<ffffffff8104dd95>] ? autoremove_wake_function+0x0/0x38
[<ffffffff8104a91d>] ? worker_thread+0x0/0x1ac
[<ffffffff8104dc9a>] kthread+0x88/0x92
[<ffffffff8100bdba>] child_rip+0xa/0x20
[<ffffffff8104dc12>] ? kthread+0x0/0x92
[<ffffffff8100bdb0>] ? child_rip+0x0/0x20
---[ end trace f561c0a58fcc89bd ]---

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: <stable@kernel.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2606cee..d4d3580 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2383,7 +2383,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 		 * the box. Return the padding, and we will release
 		 * the current locks, and try again.
 		 */
-		rb_advance_reader(cpu_buffer);
 		return event;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
@@ -2519,6 +2518,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	if (dolock)
 		spin_lock(&cpu_buffer->reader_lock);
 	event = rb_buffer_peek(buffer, cpu, ts);
+	if (event && event->type_len == RINGBUF_TYPE_PADDING)
+		rb_advance_reader(cpu_buffer);
 	if (dolock)
 		spin_unlock(&cpu_buffer->reader_lock);
 	local_irq_restore(flags);
@@ -2590,12 +2591,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 		spin_lock(&cpu_buffer->reader_lock);
 
 	event = rb_buffer_peek(buffer, cpu, ts);
-	if (!event)
-		goto out_unlock;
-
-	rb_advance_reader(cpu_buffer);
+	if (event)
+		rb_advance_reader(cpu_buffer);
 
- out_unlock:
 	if (dolock)
 		spin_unlock(&cpu_buffer->reader_lock);
 	local_irq_restore(flags);
-- 
cgit v0.10.2


From 7dbdee2e9a2ac42ea5135801bcc9d1a8e3f672aa Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 6 Aug 2009 19:53:18 -0400
Subject: tracing: Fix recordmcount.pl to handle sections with only weak
 functions

Roland Dreier found that a section that contained only a weak
function in one of the staging drivers and this caused
recordmcount.pl to spit out a warning and fail.

Although it is strange that a driver would have a weak function, and
this function only be used in one place, it should not be something
to make recordmcount.pl fail.

This patch fixes the issue in a simple manner: if only weak
functions exist in a section, then that section will not be
recorded.

Reported-by: Roland Dreier <rdreier@cisco.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 4889c44..911ba7f 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -393,7 +393,7 @@ while (<IN>) {
 	    $read_function = 0;
 	}
 	# print out any recorded offsets
-	update_funcs() if ($text_found);
+	update_funcs() if (defined($ref_func));
 
 	# reset all markers and arrays
 	$text_found = 0;
@@ -444,7 +444,7 @@ while (<IN>) {
 }
 
 # dump out anymore offsets that may have been found
-update_funcs() if ($text_found);
+update_funcs() if (defined($ref_func));
 
 # If we did not find any mcount callers, we are done (do nothing).
 if (!$opened) {
-- 
cgit v0.10.2


From 9795447f71324d8f14c19ed68b43c883135c3f59 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 7 Aug 2009 16:37:10 +0800
Subject: lockdep: Fix file mode of lock_stat

/proc/lock_stat is writable.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4A7BE7B6.10904@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa..e94caa6 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void)
 		    &proc_lockdep_stats_operations);
 
 #ifdef CONFIG_LOCK_STAT
-	proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
+	proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
+		    &proc_lock_stat_operations);
 #endif
 
 	return 0;
-- 
cgit v0.10.2


From 0e692a94e378628b7d527260ad939894454bcca8 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 7 Aug 2009 15:10:54 +0800
Subject: lockdep: Fix typos in documentation

s/head/held

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4A7BD37E.9060806@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt
index e20d913..abf768c 100644
--- a/Documentation/lockdep-design.txt
+++ b/Documentation/lockdep-design.txt
@@ -30,9 +30,9 @@ State
 The validator tracks lock-class usage history into 4n + 1 separate state bits:
 
 - 'ever held in STATE context'
-- 'ever head as readlock in STATE context'
-- 'ever head with STATE enabled'
-- 'ever head as readlock with STATE enabled'
+- 'ever held as readlock in STATE context'
+- 'ever held with STATE enabled'
+- 'ever held as readlock with STATE enabled'
 
 Where STATE can be either one of (kernel/lockdep_states.h)
  - hardirq
-- 
cgit v0.10.2


From afc5e65245255a268ab22a20477ed2c9f2cdfcd3 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 7 Aug 2009 16:33:53 +0200
Subject: ASoC: Add missing DRV_NAME definitions for fsl/* drivers

Module builds are broken due to missing DRV_NAME for
efika-audio-fabric and pcm030-audio-fabric.

Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/soc/fsl/efika-audio-fabric.c b/sound/soc/fsl/efika-audio-fabric.c
index 85b0e75..3326e2a 100644
--- a/sound/soc/fsl/efika-audio-fabric.c
+++ b/sound/soc/fsl/efika-audio-fabric.c
@@ -30,6 +30,8 @@
 #include "mpc5200_psc_ac97.h"
 #include "../codecs/stac9766.h"
 
+#define DRV_NAME "efika-audio-fabric"
+
 static struct snd_soc_device device;
 static struct snd_soc_card card;
 
diff --git a/sound/soc/fsl/pcm030-audio-fabric.c b/sound/soc/fsl/pcm030-audio-fabric.c
index 8766f7a..b928ef7 100644
--- a/sound/soc/fsl/pcm030-audio-fabric.c
+++ b/sound/soc/fsl/pcm030-audio-fabric.c
@@ -30,6 +30,8 @@
 #include "mpc5200_psc_ac97.h"
 #include "../codecs/wm9712.h"
 
+#define DRV_NAME "pcm030-audio-fabric"
+
 static struct snd_soc_device device;
 static struct snd_soc_card card;
 
-- 
cgit v0.10.2


From bd3f02212d6a457267e0c9c02c426151c436d9d4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 7 Aug 2009 12:49:29 +0200
Subject: ring-buffer: Fix memleak in ring_buffer_free()

I noticed oprofile memleaked in linux-2.6 current tree,
and tracked this ring-buffer leak.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
LKML-Reference: <4A7C06B9.2090302@gmail.com>
Cc: stable@kernel.org
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d4d3580..a330513 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer)
 
 	put_online_cpus();
 
+	kfree(buffer->buffers);
 	free_cpumask_var(buffer->cpumask);
 
 	kfree(buffer);
-- 
cgit v0.10.2


From d25f14389a65c7f95512b01415d8d4a8d62855ab Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Mon, 27 Jul 2009 12:05:38 +0900
Subject: PCI hotplug: SGI hotplug: fix build failure

The commit bd3d99c17039fd05a29587db3f4a180c48da115a ("PCI: Remove
untested Electromechanical Interlock (EMI) support in pciehp."), which
removes the definition of "struct hotplug_slot_attr", broke SGI
hotplug driver. By this commit, we get the following compile error.

drivers/pci/hotplug/sgi_hotplug.c:106: error: variable 'sn_slot_path_attr' has initializer but incomplete type
drivers/pci/hotplug/sgi_hotplug.c:106: error: unknown field 'attr' specified in initializer
drivers/pci/hotplug/sgi_hotplug.c:106: error: extra brace group at end of initializer
drivers/pci/hotplug/sgi_hotplug.c:106: error: (near initialization for 'sn_slot_path_attr')
drivers/pci/hotplug/sgi_hotplug.c:106: warning: excess elements in struct initializer
drivers/pci/hotplug/sgi_hotplug.c:106: warning: (near initialization for 'sn_slot_path_attr')
drivers/pci/hotplug/sgi_hotplug.c:106: error: unknown field 'show' specified in initializer
drivers/pci/hotplug/sgi_hotplug.c:106: warning: excess elements in struct initializer
drivers/pci/hotplug/sgi_hotplug.c:106: warning: (near initialization for 'sn_slot_path_attr')
drivers/pci/hotplug/sgi_hotplug.c: In function 'sn_hp_destroy':
drivers/pci/hotplug/sgi_hotplug.c:203: error: invalid use of undefined type 'struct hotplug_slot_attribute'
drivers/pci/hotplug/sgi_hotplug.c: In function 'sn_hotplug_slot_register':
drivers/pci/hotplug/sgi_hotplug.c:655: error: invalid use of undefined type 'struct hotplug_slot_attribute'

This patch fixes this regression by adding the definition of struct
hotplug_slot_attr into sgi_hotplug.c.

Tested-by: Mike Habeck <habeck@sgi.com>
Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>

diff --git a/drivers/pci/hotplug/sgi_hotplug.c b/drivers/pci/hotplug/sgi_hotplug.c
index a4494d7..0a724bb 100644
--- a/drivers/pci/hotplug/sgi_hotplug.c
+++ b/drivers/pci/hotplug/sgi_hotplug.c
@@ -103,6 +103,12 @@ static ssize_t path_show (struct hotplug_slot *bss_hotplug_slot,
 	return retval;
 }
 
+struct hotplug_slot_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct hotplug_slot *, char *);
+	ssize_t (*store)(struct hotplug_slot *, const char *, size_t);
+};
+
 static struct hotplug_slot_attribute sn_slot_path_attr = __ATTR_RO(path);
 
 static int sn_pci_slot_valid(struct pci_bus *pci_bus, int device)
-- 
cgit v0.10.2


From 94f81a47c4a7a2d7a16fcfdd6d81da381732c101 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Mon, 27 Jul 2009 12:06:46 +0900
Subject: PCI hotplug: SGI hotplug: do not use hotplug_slot_attr

By the pci slot changes, callbacks of attributes under slot directory
(/sys/bus/pci/slots) had been changed to get the pointer to struct
pci_slot instead of struct hotplug_slot. So the path_show() that
assumes the parameter is a pointer to struct hotplug_slot seems
broken.

Tested-by: Mike Habeck <habeck@sgi.com>
Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>

diff --git a/drivers/pci/hotplug/sgi_hotplug.c b/drivers/pci/hotplug/sgi_hotplug.c
index 0a724bb..8aebe1e9 100644
--- a/drivers/pci/hotplug/sgi_hotplug.c
+++ b/drivers/pci/hotplug/sgi_hotplug.c
@@ -90,11 +90,10 @@ static struct hotplug_slot_ops sn_hotplug_slot_ops = {
 
 static DEFINE_MUTEX(sn_hotplug_mutex);
 
-static ssize_t path_show (struct hotplug_slot *bss_hotplug_slot,
-	       		  char *buf)
+static ssize_t path_show(struct pci_slot *pci_slot, char *buf)
 {
 	int retval = -ENOENT;
-	struct slot *slot = bss_hotplug_slot->private;
+	struct slot *slot = pci_slot->hotplug->private;
 
 	if (!slot)
 		return retval;
@@ -103,13 +102,7 @@ static ssize_t path_show (struct hotplug_slot *bss_hotplug_slot,
 	return retval;
 }
 
-struct hotplug_slot_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct hotplug_slot *, char *);
-	ssize_t (*store)(struct hotplug_slot *, const char *, size_t);
-};
-
-static struct hotplug_slot_attribute sn_slot_path_attr = __ATTR_RO(path);
+static struct pci_slot_attribute sn_slot_path_attr = __ATTR_RO(path);
 
 static int sn_pci_slot_valid(struct pci_bus *pci_bus, int device)
 {
-- 
cgit v0.10.2


From e7432675f8ca868a4af365759a8d4c3779a3d922 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 6 Aug 2009 16:12:58 -0700
Subject: ocfs2: Initialize the cluster we're writing to in a non-sparse extend

In a non-sparse extend, we correctly allocate (and zero) the clusters between
the old_i_size and pos, but we don't zero the portions of the cluster we're
writing to outside of pos<->len.

It handles clustersize > pagesize and blocksize < pagesize.

[Cleaned up by Joel Becker.]

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e511df1..b401654 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -895,18 +895,17 @@ struct ocfs2_write_cluster_desc {
 	 */
 	unsigned	c_new;
 	unsigned	c_unwritten;
+	unsigned	c_needs_zero;
 };
 
-static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
-{
-	return d->c_new || d->c_unwritten;
-}
-
 struct ocfs2_write_ctxt {
 	/* Logical cluster position / len of write */
 	u32				w_cpos;
 	u32				w_clen;
 
+	/* First cluster allocated in a nonsparse extend */
+	u32				w_first_new_cpos;
+
 	struct ocfs2_write_cluster_desc	w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
 
 	/*
@@ -984,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 		return -ENOMEM;
 
 	wc->w_cpos = pos >> osb->s_clustersize_bits;
+	wc->w_first_new_cpos = UINT_MAX;
 	cend = (pos + len - 1) >> osb->s_clustersize_bits;
 	wc->w_clen = cend - wc->w_cpos + 1;
 	get_bh(di_bh);
@@ -1218,20 +1218,18 @@ out:
  */
 static int ocfs2_write_cluster(struct address_space *mapping,
 			       u32 phys, unsigned int unwritten,
+			       unsigned int should_zero,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct ocfs2_write_ctxt *wc, u32 cpos,
 			       loff_t user_pos, unsigned user_len)
 {
-	int ret, i, new, should_zero = 0;
+	int ret, i, new;
 	u64 v_blkno, p_blkno;
 	struct inode *inode = mapping->host;
 	struct ocfs2_extent_tree et;
 
 	new = phys == 0 ? 1 : 0;
-	if (new || unwritten)
-		should_zero = 1;
-
 	if (new) {
 		u32 tmp_pos;
 
@@ -1342,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
 			local_len = osb->s_clustersize - cluster_off;
 
 		ret = ocfs2_write_cluster(mapping, desc->c_phys,
-					  desc->c_unwritten, data_ac, meta_ac,
+					  desc->c_unwritten,
+					  desc->c_needs_zero,
+					  data_ac, meta_ac,
 					  wc, desc->c_cpos, pos, local_len);
 		if (ret) {
 			mlog_errno(ret);
@@ -1392,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 		 * newly allocated cluster.
 		 */
 		desc = &wc->w_desc[0];
-		if (ocfs2_should_zero_cluster(desc))
+		if (desc->c_needs_zero)
 			ocfs2_figure_cluster_boundaries(osb,
 							desc->c_cpos,
 							&wc->w_target_from,
 							NULL);
 
 		desc = &wc->w_desc[wc->w_clen - 1];
-		if (ocfs2_should_zero_cluster(desc))
+		if (desc->c_needs_zero)
 			ocfs2_figure_cluster_boundaries(osb,
 							desc->c_cpos,
 							NULL,
@@ -1467,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 			phys++;
 		}
 
+		/*
+		 * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
+		 * file that got extended.  w_first_new_cpos tells us
+		 * where the newly allocated clusters are so we can
+		 * zero them.
+		 */
+		if (desc->c_cpos >= wc->w_first_new_cpos) {
+			BUG_ON(phys == 0);
+			desc->c_needs_zero = 1;
+		}
+
 		desc->c_phys = phys;
 		if (phys == 0) {
 			desc->c_new = 1;
+			desc->c_needs_zero = 1;
 			*clusters_to_alloc = *clusters_to_alloc + 1;
 		}
-		if (ext_flags & OCFS2_EXT_UNWRITTEN)
+
+		if (ext_flags & OCFS2_EXT_UNWRITTEN) {
 			desc->c_unwritten = 1;
+			desc->c_needs_zero = 1;
+		}
 
 		num_clusters--;
 	}
@@ -1633,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
 	if (newsize <= i_size_read(inode))
 		return 0;
 
-	ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
+	ret = ocfs2_extend_no_holes(inode, newsize, pos);
 	if (ret)
 		mlog_errno(ret);
 
+	wc->w_first_new_cpos =
+		ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
 	return ret;
 }
 
@@ -1645,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page)
 {
-	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+	int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
 	unsigned int clusters_to_alloc, extents_to_split;
 	struct ocfs2_write_ctxt *wc;
 	struct inode *inode = mapping->host;
@@ -1723,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
 	}
 
-	ocfs2_set_target_boundaries(osb, wc, pos, len,
-				    clusters_to_alloc + extents_to_split);
+	/*
+	 * We have to zero sparse allocated clusters, unwritten extent clusters,
+	 * and non-sparse clusters we just extended.  For non-sparse writes,
+	 * we know zeros will only be needed in the first and/or last cluster.
+	 */
+	if (clusters_to_alloc || extents_to_split ||
+	    wc->w_desc[0].c_needs_zero ||
+	    wc->w_desc[wc->w_clen - 1].c_needs_zero)
+		cluster_of_pages = 1;
+	else
+		cluster_of_pages = 0;
+
+	ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
 
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
@@ -1757,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * extent.
 	 */
 	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
-					 clusters_to_alloc + extents_to_split,
-					 mmap_page);
+					 cluster_of_pages, mmap_page);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_quota;
-- 
cgit v0.10.2


From 8a57a9dda760ea7845390f1cd36f3eb2a49391d8 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 6 Aug 2009 16:07:50 -0700
Subject: ocfs2: keep index within status_map[]

Do not exceed array status_map[]

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 3f66137..e49c410 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -17,6 +17,7 @@
  * General Public License for more details.
  */
 
+#include <linux/kernel.h>
 #include <linux/crc32.h>
 #include <linux/module.h>
 
@@ -153,7 +154,7 @@ static int status_map[] = {
 
 static int dlm_status_to_errno(enum dlm_status status)
 {
-	BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
+	BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map));
 
 	return status_map[status];
 }
-- 
cgit v0.10.2


From 087d7e56deffb611a098e7e257388a41edbeef1f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 4 Aug 2009 08:59:59 -0700
Subject: x86: Fix MSI-X initialization by using online_mask for x2apic
 target_cpus

found a system where x2apic reports an MSI-X irq initialization
failure:

[  302.859446] igbvf 0000:81:10.4: enabling device (0000 -> 0002)
[  302.874369] igbvf 0000:81:10.4: using 64bit DMA mask
[  302.879023] igbvf 0000:81:10.4: using 64bit consistent DMA mask
[  302.894386] igbvf 0000:81:10.4: enabling bus mastering
[  302.898171] igbvf 0000:81:10.4: setting latency timer to 64
[  302.914050] reserve_memtype added 0xefb08000-0xefb0c000, track uncached-minus, req uncached-minus, ret uncached-minus
[  302.933839] reserve_memtype added 0xefb28000-0xefb29000, track uncached-minus, req uncached-minus, ret uncached-minus
[  302.940367]   alloc irq_desc for 265 on node 4
[  302.956874]   alloc kstat_irqs on node 4
[  302.959452] alloc irq_2_iommu on node 0
[  302.974328] igbvf 0000:81:10.4: irq 265 for MSI/MSI-X
[  302.977778]   alloc irq_desc for 266 on node 4
[  302.980347]   alloc kstat_irqs on node 4
[  302.995312] free_memtype request 0xefb28000-0xefb29000
[  302.998816] igbvf 0000:81:10.4: Failed to initialize MSI-X interrupts.

... it turns out that when trying to enable MSI-X,
__assign_irq_vector(new, cfg_new, apic->target_cpus()) can not
get vector because for x2apic target-cpus returns cpumask_of(0)

Update that to online_mask like xapic.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <4A785AFF.3050902@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 2ed4e2b..a5371ec 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	return x2apic_enabled();
 }
 
-/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
-
+/*
+ * need to use more than cpu 0, because we need more vectors when
+ * MSI-X are used.
+ */
 static const struct cpumask *x2apic_target_cpus(void)
 {
-	return cpumask_of(0);
+	return cpu_online_mask;
 }
 
 /*
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 0b631c6..a8989aa 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 		return 0;
 }
 
-/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
-
+/*
+ * need to use more than cpu 0, because we need more vectors when
+ * MSI-X are used.
+ */
 static const struct cpumask *x2apic_target_cpus(void)
 {
-	return cpumask_of(0);
+	return cpu_online_mask;
 }
 
 static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-- 
cgit v0.10.2


From ad7d6c7a0654a4bbda3e109f56af713267e96274 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 4 Aug 2009 09:01:33 -0700
Subject: x86/irq: Fix move_irq_desc() for nodes without ram

Don't move it if target node is -1.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4A785B5D.4070702@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 2f69bee..3fd3019 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -107,8 +107,8 @@ out_unlock:
 
 struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
 {
-	/* those all static, do move them */
-	if (desc->irq < NR_IRQS_LEGACY)
+	/* those static or target node is -1, do not move them */
+	if (desc->irq < NR_IRQS_LEGACY || node == -1)
 		return desc;
 
 	if (desc->node != node)
-- 
cgit v0.10.2


From 498cdbfbcf98e0d2c90a26e6a02a82f043876e48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ozan=20=C3=87a=C4=9Flayan?= <ozan@pardus.org.tr>
Date: Tue, 4 Aug 2009 19:39:31 +0300
Subject: x86: Add quirk to make Apple MacBookPro5,1 use reboot=pci
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MacBookPro5,1 is not able to reboot unless reboot=pci is set.
This patch forces it through a DMI quirk specific to this
device.

Signed-off-by: Ozan Çağlayan <ozan@pardus.org.tr>
LKML-Reference: <1249403971-6543-1-git-send-email-ozan@pardus.org.tr>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 834c9da..9eb8976 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -405,7 +405,7 @@ EXPORT_SYMBOL(machine_real_restart);
 #endif /* CONFIG_X86_32 */
 
 /*
- * Apple MacBook5,2 (2009 MacBook) needs reboot=p
+ * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
  */
 static int __init set_pci_reboot(const struct dmi_system_id *d)
 {
@@ -426,6 +426,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"),
 		},
 	},
+	{	/* Handle problems with rebooting on Apple MacBookPro5,1 */
+		.callback = set_pci_reboot,
+		.ident = "Apple MacBookPro5,1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5,1"),
+		},
+	},
 	{ }
 };
 
-- 
cgit v0.10.2


From 96b2de313b1e0e02aea80ee47df6a2b5cbdf8e13 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sat, 8 Aug 2009 10:49:09 -0500
Subject: tracing/filters: Don't use pred on alloc failure

Dan Carpenter sent me a fix to prevent pred from being used if
it couldn't be allocated.  I noticed the same problem also
existed for the create_pred() case and added a fix for that.

Reported-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1249746549.6453.29.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 936c621..1557148 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1029,6 +1029,8 @@ static int replace_preds(struct event_subsystem *system,
 
 		if (elt->op == OP_AND || elt->op == OP_OR) {
 			pred = create_logical_pred(elt->op);
+			if (!pred)
+				return -ENOMEM;
 			if (call) {
 				err = filter_add_pred(ps, call, pred);
 				filter_free_pred(pred);
@@ -1048,6 +1050,8 @@ static int replace_preds(struct event_subsystem *system,
 		}
 
 		pred = create_pred(elt->op, operand1, operand2);
+		if (!pred)
+			return -ENOMEM;
 		if (call) {
 			err = filter_add_pred(ps, call, pred);
 			filter_free_pred(pred);
-- 
cgit v0.10.2


From 26528e773ecc74fb1b61b7275f86f761cbb340ec Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sat, 8 Aug 2009 10:49:53 -0500
Subject: tracing/filters: Always free pred on filter_add_subsystem_pred()
 failure

If filter_add_subsystem_pred() fails due to ENOSPC or ENOMEM,
the pred doesn't get freed, while as a side effect it does for
other errors. Make it so the caller always frees the pred for
any error.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1249746593.6453.32.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 1557148..f32dc9d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 		return -ENOSPC;
 	}
 
-	filter->preds[filter->n_preds] = pred;
-	filter->n_preds++;
-
 	list_for_each_entry(call, &ftrace_events, list) {
 
 		if (!call->define_fields)
@@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 		}
 		replace_filter_string(call->filter, filter_string);
 	}
+
+	filter->preds[filter->n_preds] = pred;
+	filter->n_preds++;
 out:
 	return err;
 }
@@ -1034,9 +1034,12 @@ static int replace_preds(struct event_subsystem *system,
 			if (call) {
 				err = filter_add_pred(ps, call, pred);
 				filter_free_pred(pred);
-			} else
+			} else {
 				err = filter_add_subsystem_pred(ps, system,
 							pred, filter_string);
+				if (err)
+					filter_free_pred(pred);
+			}
 			if (err)
 				return err;
 
@@ -1055,9 +1058,12 @@ static int replace_preds(struct event_subsystem *system,
 		if (call) {
 			err = filter_add_pred(ps, call, pred);
 			filter_free_pred(pred);
-		} else
+		} else {
 			err = filter_add_subsystem_pred(ps, system, pred,
 							filter_string);
+			if (err)
+				filter_free_pred(pred);
+		}
 		if (err)
 			return err;
 
-- 
cgit v0.10.2


From 17d42c1c497aa54952b9e58c1502a46f0df40315 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Thu, 6 Aug 2009 16:03:30 -0700
Subject: posix_cpu_timers_exit_group(): Do not use thread_group_cputimer()

When the process exits we don't have to run new cputimer nor
use running one (as it not accounts when tsk->exit_state != 0)
to get process CPU times.  As there is only one thread we can
just use CPU times fields from task and signal structs.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Roland McGrath <roland@redhat.com>
Cc: Vitaly Mayatskikh <vmayatsk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0..e33a21c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
 {
-	struct task_cputime cputime;
+	struct signal_struct *const sig = tsk->signal;
 
-	thread_group_cputimer(tsk, &cputime);
 	cleanup_timers(tsk->signal->cpu_timers,
-		       cputime.utime, cputime.stime, cputime.sum_exec_runtime);
+		       cputime_add(tsk->utime, sig->utime),
+		       cputime_add(tsk->stime, sig->stime),
+		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
 }
 
 static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
-- 
cgit v0.10.2


From 38d5487db7f289be1d56ac7df704ee49ed3213b9 Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp@keithp.com>
Date: Mon, 20 Jul 2009 14:49:17 -0700
Subject: drm: When adding probed modes, preserve duplicate mode types

The code which takes probed modes and adds them to a connector eliminates
duplicate modes by comparing them using drm_mode_equal. That function
doesn't consider the type bits, which means that any modes which differ only
in the type field will be lost.

One of the bits in the mode->type field is the DRM_MODE_TYPE_PREFERRED bit.
If the mode with that bit is lost, then higher level code will not know
which mode to select, causing a random mode to be used instead.

This patch simply merges the two mode type bits together; that seems
reasonable to me, but perhaps only a subset of the bits should be used? None
of these can be user defined as they all come from looking at just the
hardware.

Signed-off-by: Keith Packard <keithp@keithp.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
index 54f492a..7914097 100644
--- a/drivers/gpu/drm/drm_modes.c
+++ b/drivers/gpu/drm/drm_modes.c
@@ -566,6 +566,8 @@ void drm_mode_connector_list_update(struct drm_connector *connector)
 				found_it = 1;
 				/* if equal delete the probed mode */
 				mode->status = pmode->status;
+				/* Merge type bits together */
+				mode->type |= pmode->type;
 				list_del(&pmode->head);
 				drm_mode_destroy(connector->dev, pmode);
 				break;
-- 
cgit v0.10.2


From 8d3457ec3198a569dd14dc9e3ae8b6163bcaa0b5 Mon Sep 17 00:00:00 2001
From: Paul Rolland <rol@as2917.net>
Date: Sun, 9 Aug 2009 12:24:01 +1000
Subject: drm: silence pointless vblank warning.

Some applications/hardware combinations are triggering the message "failed to
acquire vblank counter" to be issued up to 20 times a second, which makes it
both useless and dangerous, as this may hide other important messages.
This changes makes it only appear when people are debugging.

Signed-off-by: Paul Rolland <rol@as2917.net>
Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Lost-twice-by: Dave Airlie <airlied@redhat.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c
index b4a3dbc..f85aaf2 100644
--- a/drivers/gpu/drm/drm_irq.c
+++ b/drivers/gpu/drm/drm_irq.c
@@ -566,7 +566,7 @@ int drm_wait_vblank(struct drm_device *dev, void *data,
 
 	ret = drm_vblank_get(dev, crtc);
 	if (ret) {
-		DRM_ERROR("failed to acquire vblank counter, %d\n", ret);
+		DRM_DEBUG("failed to acquire vblank counter, %d\n", ret);
 		return ret;
 	}
 	seq = drm_vblank_count(dev, crtc);
-- 
cgit v0.10.2


From 6cb504c29b1338925c83e4430e42a51eaa43781e Mon Sep 17 00:00:00 2001
From: Frans Pop <elendil@planet.nl>
Date: Sun, 9 Aug 2009 12:25:29 +1000
Subject: drm/i915: silence vblank warnings

these errors are pretty pointless

Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 83aee80..7ebc84c 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -190,7 +190,7 @@ u32 i915_get_vblank_counter(struct drm_device *dev, int pipe)
 	low_frame = pipe ? PIPEBFRAMEPIXEL : PIPEAFRAMEPIXEL;
 
 	if (!i915_pipe_enabled(dev, pipe)) {
-		DRM_ERROR("trying to get vblank count for disabled pipe %d\n", pipe);
+		DRM_DEBUG("trying to get vblank count for disabled pipe %d\n", pipe);
 		return 0;
 	}
 
@@ -219,7 +219,7 @@ u32 gm45_get_vblank_counter(struct drm_device *dev, int pipe)
 	int reg = pipe ? PIPEB_FRMCOUNT_GM45 : PIPEA_FRMCOUNT_GM45;
 
 	if (!i915_pipe_enabled(dev, pipe)) {
-		DRM_ERROR("trying to get vblank count for disabled pipe %d\n", pipe);
+		DRM_DEBUG("trying to get vblank count for disabled pipe %d\n", pipe);
 		return 0;
 	}
 
-- 
cgit v0.10.2


From fdb8a42742ac95606668f73481dfb2f760658fdd Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 6 Aug 2009 15:58:13 -0700
Subject: x86: fix buffer overflow in efi_init()

If the vendor name (from c16) can be longer than 100 bytes (or missing a
terminating null), then the null is written past the end of vendor[].

Found with Parfait, http://research.sun.com/projects/parfait/

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Ying <ying.huang@intel.com>

diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 19ccf6d..fe26ba3 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -354,7 +354,7 @@ void __init efi_init(void)
 	 */
 	c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
 	if (c16) {
-		for (i = 0; i < sizeof(vendor) && *c16; ++i)
+		for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
 			vendor[i] = *c16++;
 		vendor[i] = '\0';
 	} else
-- 
cgit v0.10.2


From b4a2f5e723e4f7df46731106faf9e2405673c073 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 5 Jul 2009 18:48:11 +0300
Subject: KVM: Avoid redelivery of edge interrupt before next edge

The check for an edge is broken in current ioapic code. ioapic->irr is
cleared on each edge interrupt by ioapic_service() and this makes
old_irr != ioapic->irr condition in kvm_ioapic_set_irq() to be always
true. The patch fixes the code to properly recognise edge.

Some HW emulation calls set_irq() without level change. If each such
call is propagated to an OS it may confuse a device driver. This is the
case with keyboard device emulation and Windows XP x64  installer on SMP VM.
Each keystroke produce two interrupts (down/up) one interrupt is
submitted to CPU0 and another to CPU1. This confuses Windows somehow
and it ignores keystrokes.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 1eddae9..1150c6d 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -95,8 +95,6 @@ static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
 		if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
 			pent->fields.remote_irr = 1;
 	}
-	if (!pent->fields.trig_mode)
-		ioapic->irr &= ~(1 << idx);
 
 	return injected;
 }
@@ -136,7 +134,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 		mask_after = ioapic->redirtbl[index].fields.mask;
 		if (mask_before != mask_after)
 			kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after);
-		if (ioapic->irr & (1 << index))
+		if (ioapic->redirtbl[index].fields.trig_mode == IOAPIC_LEVEL_TRIG
+		    && ioapic->irr & (1 << index))
 			ioapic_service(ioapic, index);
 		break;
 	}
@@ -184,9 +183,10 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 		if (!level)
 			ioapic->irr &= ~mask;
 		else {
+			int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
 			ioapic->irr |= mask;
-			if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
-			    || !entry.fields.remote_irr)
+			if ((edge && old_irr != ioapic->irr) ||
+			    (!edge && !entry.fields.remote_irr))
 				ret = ioapic_service(ioapic, irq);
 		}
 	}
-- 
cgit v0.10.2


From 3a6593050fbd8bbcaed3a44d01c31d907315c86c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 21 Jul 2009 17:34:57 +0200
Subject: perf_counter, ftrace: Fix perf_counter integration

Adds possible second part to the assign argument of TP_EVENT().

  TP_perf_assign(
	__perf_count(foo);
	__perf_addr(bar);
  )

Which, when specified make the swcounter increment with @foo instead
of the usual 1, and report @bar for PERF_SAMPLE_ADDR (data address
associated with the event) when this triggers a counter overflow.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 1867553..fec71f8 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -144,6 +144,9 @@
 #undef TP_fast_assign
 #define TP_fast_assign(args...) args
 
+#undef TP_perf_assign
+#define TP_perf_assign(args...)
+
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
 static int								\
@@ -345,6 +348,88 @@ static inline int ftrace_get_offsets_##call(				\
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+#ifdef CONFIG_EVENT_PROFILE
+
+/*
+ * Generate the functions needed for tracepoint perf_counter support.
+ *
+ * static void ftrace_profile_<call>(proto)
+ * {
+ * 	extern void perf_tpcounter_event(int, u64, u64);
+ * 	u64 __addr = 0, __count = 1;
+ *
+ * 	<assign>   <-- here we expand the TP_perf_assign() macro
+ *
+ * 	perf_tpcounter_event(event_<call>.id, __addr, __count);
+ * }
+ *
+ * static int ftrace_profile_enable_<call>(struct ftrace_event_call *event_call)
+ * {
+ * 	int ret = 0;
+ *
+ * 	if (!atomic_inc_return(&event_call->profile_count))
+ * 		ret = register_trace_<call>(ftrace_profile_<call>);
+ *
+ * 	return ret;
+ * }
+ *
+ * static void ftrace_profile_disable_<call>(struct ftrace_event_call *event_call)
+ * {
+ * 	if (atomic_add_negative(-1, &event->call->profile_count))
+ * 		unregister_trace_<call>(ftrace_profile_<call>);
+ * }
+ *
+ */
+
+#undef TP_fast_assign
+#define TP_fast_assign(args...)
+
+#undef TP_perf_assign
+#define TP_perf_assign(args...) args
+
+#undef __perf_addr
+#define __perf_addr(a) __addr = (a)
+
+#undef __perf_count
+#define __perf_count(c) __count = (c)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+									\
+static void ftrace_profile_##call(proto)				\
+{									\
+	extern void perf_tpcounter_event(int, u64, u64);		\
+	u64 __addr = 0, __count = 1;					\
+	{ assign; }							\
+	perf_tpcounter_event(event_##call.id, __addr, __count);		\
+}									\
+									\
+static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
+{									\
+	int ret = 0;							\
+									\
+	if (!atomic_inc_return(&event_call->profile_count))		\
+		ret = register_trace_##call(ftrace_profile_##call);	\
+									\
+	return ret;							\
+}									\
+									\
+static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
+{									\
+	if (atomic_add_negative(-1, &event_call->profile_count))	\
+		unregister_trace_##call(ftrace_profile_##call);		\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef TP_fast_assign
+#define TP_fast_assign(args...) args
+
+#undef TP_perf_assign
+#define TP_perf_assign(args...)
+
+#endif
+
 /*
  * Stage 4 of the trace events.
  *
@@ -447,28 +532,6 @@ static inline int ftrace_get_offsets_##call(				\
 #define TP_FMT(fmt, args...)	fmt "\n", ##args
 
 #ifdef CONFIG_EVENT_PROFILE
-#define _TRACE_PROFILE(call, proto, args)				\
-static void ftrace_profile_##call(proto)				\
-{									\
-	extern void perf_tpcounter_event(int);				\
-	perf_tpcounter_event(event_##call.id);				\
-}									\
-									\
-static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
-{									\
-	int ret = 0;							\
-									\
-	if (!atomic_inc_return(&event_call->profile_count))		\
-		ret = register_trace_##call(ftrace_profile_##call);	\
-									\
-	return ret;							\
-}									\
-									\
-static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
-{									\
-	if (atomic_add_negative(-1, &event_call->profile_count))	\
-		unregister_trace_##call(ftrace_profile_##call);		\
-}
 
 #define _TRACE_PROFILE_INIT(call)					\
 	.profile_count = ATOMIC_INIT(-1),				\
@@ -476,7 +539,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
 	.profile_disable = ftrace_profile_disable_##call,
 
 #else
-#define _TRACE_PROFILE(call, proto, args)
 #define _TRACE_PROFILE_INIT(call)
 #endif
 
@@ -502,7 +564,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
 									\
 static struct ftrace_event_call event_##call;				\
 									\
@@ -586,6 +647,5 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
-#undef _TRACE_PROFILE
 #undef _TRACE_PROFILE_INIT
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 673c1aa..52eb4b6 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3703,17 +3703,17 @@ static const struct pmu perf_ops_task_clock = {
 };
 
 #ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id)
+void perf_tpcounter_event(int event_id, u64 addr, u64 count)
 {
 	struct perf_sample_data data = {
 		.regs = get_irq_regs(),
-		.addr = 0,
+		.addr = addr,
 	};
 
 	if (!data.regs)
 		data.regs = task_pt_regs(current);
 
-	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
+	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
 }
 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 
-- 
cgit v0.10.2


From f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 7 Aug 2009 01:25:54 +0200
Subject: perf_counter: Fix/complete ftrace event records sampling

This patch implements the kernel side support for ftrace event
record sampling.

A new counter sampling attribute is added:

   PERF_SAMPLE_TP_RECORD

which requests ftrace events record sampling. In this case
if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint
fires, we emit the tracepoint binary record to the
perfcounter event buffer, as a sample.

Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf
record:

 perf record -f -F 1 -a -e workqueue:workqueue_execution
 perf report -D

 0x21e18 [0x48]: event: 9
 .
 . ... raw event: size 72 bytes
 .  0000:  09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff  ......H........
 .  0010:  0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00  ........!......
 .  0020:  2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e  +...........eve
 .  0030:  74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00  ts/1...........
 .  0040:  e0 b1 31 81 ff ff ff ff                          .......
.
0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33

The raw ftrace binary record starts at offset 0020.

Translation:

 struct trace_entry {
	type		= 0x2b = 43;
	flags		= 1;
	preempt_count	= 2;
	pid		= 0xa = 10;
	tgid		= 0xa = 10;
 }

 thread_comm = "events/1"
 thread_pid  = 0xa = 10;
 func	    = 0xffffffff8131b1e0 = flush_to_ldisc()

What will come next?

 - Userspace support ('perf trace'), 'flight data recorder' mode
   for perf trace, etc.

 - The unconditional copy from the profiling callback brings
   some costs however if someone wants no such sampling to
   occur, and needs to be fixed in the future. For that we need
   to have an instant access to the perf counter attribute.
   This is a matter of a flag to add in the struct ftrace_event.

 - Take care of the events recursivity! Don't ever try to record
   a lock event for example, it seems some locking is used in
   the profiling fast path and lead to a tracing recursivity.
   That will be fixed using raw spinlock or recursivity
   protection.

 - [...]

 - Profit! :-)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index d7cd193..a81170d 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -89,7 +89,9 @@ enum print_line_t {
 	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
 };
 
-
+void tracing_generic_entry_update(struct trace_entry *entry,
+				  unsigned long flags,
+				  int pc);
 struct ring_buffer_event *
 trace_current_buffer_lock_reserve(int type, unsigned long len,
 				  unsigned long flags, int pc);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e604e6e..a67dd5c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -121,8 +121,9 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_CPU				= 1U << 7,
 	PERF_SAMPLE_PERIOD			= 1U << 8,
 	PERF_SAMPLE_STREAM_ID			= 1U << 9,
+	PERF_SAMPLE_TP_RECORD			= 1U << 10,
 
-	PERF_SAMPLE_MAX = 1U << 10,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
 };
 
 /*
@@ -413,6 +414,11 @@ struct perf_callchain_entry {
 	__u64				ip[PERF_MAX_STACK_DEPTH];
 };
 
+struct perf_tracepoint_record {
+	int				size;
+	char				*record;
+};
+
 struct task_struct;
 
 /**
@@ -681,6 +687,7 @@ struct perf_sample_data {
 	struct pt_regs			*regs;
 	u64				addr;
 	u64				period;
+	void				*private;
 };
 
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index fec71f8..7fb16d9 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -353,15 +353,7 @@ static inline int ftrace_get_offsets_##call(				\
 /*
  * Generate the functions needed for tracepoint perf_counter support.
  *
- * static void ftrace_profile_<call>(proto)
- * {
- * 	extern void perf_tpcounter_event(int, u64, u64);
- * 	u64 __addr = 0, __count = 1;
- *
- * 	<assign>   <-- here we expand the TP_perf_assign() macro
- *
- * 	perf_tpcounter_event(event_<call>.id, __addr, __count);
- * }
+ * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
  *
  * static int ftrace_profile_enable_<call>(struct ftrace_event_call *event_call)
  * {
@@ -381,28 +373,10 @@ static inline int ftrace_get_offsets_##call(				\
  *
  */
 
-#undef TP_fast_assign
-#define TP_fast_assign(args...)
-
-#undef TP_perf_assign
-#define TP_perf_assign(args...) args
-
-#undef __perf_addr
-#define __perf_addr(a) __addr = (a)
-
-#undef __perf_count
-#define __perf_count(c) __count = (c)
-
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 									\
-static void ftrace_profile_##call(proto)				\
-{									\
-	extern void perf_tpcounter_event(int, u64, u64);		\
-	u64 __addr = 0, __count = 1;					\
-	{ assign; }							\
-	perf_tpcounter_event(event_##call.id, __addr, __count);		\
-}									\
+static void ftrace_profile_##call(proto);				\
 									\
 static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
 {									\
@@ -422,12 +396,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TP_perf_assign
-#define TP_perf_assign(args...)
-
 #endif
 
 /*
@@ -647,5 +615,99 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+/*
+ * Define the insertion callback to profile events
+ *
+ * The job is very similar to ftrace_raw_event_<call> except that we don't
+ * insert in the ring buffer but in a perf counter.
+ *
+ * static void ftrace_profile_<call>(proto)
+ * {
+ *	struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
+ *	struct ftrace_event_call *event_call = &event_<call>;
+ *	extern void perf_tpcounter_event(int, u64, u64, void *, int);
+ *	struct ftrace_raw_##call *entry;
+ *	u64 __addr = 0, __count = 1;
+ *	unsigned long irq_flags;
+ *	int __entry_size;
+ *	int __data_size;
+ *	int pc;
+ *
+ *	local_save_flags(irq_flags);
+ *	pc = preempt_count();
+ *
+ *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
+ *	__entry_size = __data_size + sizeof(*entry);
+ *
+ *	do {
+ *		char raw_data[__entry_size]; <- allocate our sample in the stack
+ *		struct trace_entry *ent;
+ *
+ *		entry = (struct ftrace_raw_<call> *)raw_data;
+ *		ent = &entry->ent;
+ *		tracing_generic_entry_update(ent, irq_flags, pc);
+ *		ent->type = event_call->id;
+ *
+ *		<tstruct> <- do some jobs with dynamic arrays
+ *
+ *		<assign>  <- affect our values
+ *
+ *		perf_tpcounter_event(event_call->id, __addr, __count, entry,
+ *			     __entry_size);  <- submit them to perf counter
+ *	} while (0);
+ *
+ * }
+ */
+
+#ifdef CONFIG_EVENT_PROFILE
+
+#undef __perf_addr
+#define __perf_addr(a) __addr = (a)
+
+#undef __perf_count
+#define __perf_count(c) __count = (c)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+static void ftrace_profile_##call(proto)				\
+{									\
+	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	extern void perf_tpcounter_event(int, u64, u64, void *, int);	\
+	struct ftrace_raw_##call *entry;				\
+	u64 __addr = 0, __count = 1;					\
+	unsigned long irq_flags;					\
+	int __entry_size;						\
+	int __data_size;						\
+	int pc;								\
+									\
+	local_save_flags(irq_flags);					\
+	pc = preempt_count();						\
+									\
+	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
+	__entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\
+									\
+	do {								\
+		char raw_data[__entry_size];				\
+		struct trace_entry *ent;				\
+									\
+		entry = (struct ftrace_raw_##call *)raw_data;		\
+		ent = &entry->ent;					\
+		tracing_generic_entry_update(ent, irq_flags, pc);	\
+		ent->type = event_call->id;				\
+									\
+		tstruct							\
+									\
+		{ assign; }						\
+									\
+		perf_tpcounter_event(event_call->id, __addr, __count, entry,\
+			     __entry_size);				\
+	} while (0);							\
+									\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+#endif /* CONFIG_EVENT_PROFILE */
+
 #undef _TRACE_PROFILE_INIT
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 52eb4b6..8681021 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
+	struct perf_tracepoint_record *tp;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 			header.size += sizeof(u64);
 	}
 
+	if (sample_type & PERF_SAMPLE_TP_RECORD) {
+		tp = data->private;
+		header.size += tp->size;
+	}
+
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
 	if (ret)
 		return;
@@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_TP_RECORD)
+		perf_output_copy(&handle, tp->record, tp->size);
+
 	perf_output_end(&handle);
 }
 
@@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = {
 };
 
 #ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id, u64 addr, u64 count)
+void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+			  int entry_size)
 {
+	struct perf_tracepoint_record tp = {
+		.size = entry_size,
+		.record = record,
+	};
+
 	struct perf_sample_data data = {
 		.regs = get_irq_regs(),
 		.addr = addr,
+		.private = &tp,
 	};
 
 	if (!data.regs)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8930e39..c22b40f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
 		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
 }
+EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
 						    int type,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3548ae5..8b9f4f6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 					  int *ent_cpu, u64 *ent_ts);
 
-void tracing_generic_entry_update(struct trace_entry *entry,
-				  unsigned long flags,
-				  int pc);
-
 void default_wait_pipe(struct trace_iterator *iter);
 void poll_wait_pipe(struct trace_iterator *iter);
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 6da0992..90c9808 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -412,6 +412,7 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	if (call_graph)
 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
 
+
 	attr->mmap		= track;
 	attr->comm		= track;
 	attr->inherit		= (cpu < 0) && inherit;
-- 
cgit v0.10.2


From 923c42c19944da214d697e312a040384a0e33e78 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 22 Jul 2009 20:36:03 +0200
Subject: perf_counter tools: Fix/resurrect perf top annotation in a simple
 interactive form

perf top used to have annotation support, but it has bitrotted and
removed.

This patch restores that: it allows the user to select any symbol
in kernel space for source level annotation on the fly, switch
between event counters and alter display variables. When symbol
details are being displayed, stopping annotation reverts to normal.

known keys:
        [d]     select display delay.
        [e]     select display entries (lines).
        [E]     select annotation event counter.
        [f]     select normal display count filter.
        [F]     select annotation display count filter (percentage).
        [qQ]    quit.
        [s]     select annotation symbol and start annotation.
        [S]     stop annotation, revert to normal display.
        [z]     toggle event count zeroing.

Sample:
------------------------------------------------------------------------------
   PerfTop:   16719 irqs/sec  kernel:78.7% [cache-misses/cache-references/instructions/cycles],  (all, 4 CPUs)
------------------------------------------------------------------------------

Showing cache-misses for e1000_clean_rx_irq
  Events  Pcnt (>=3%)
       0  0.0%                  /* adjust length to remove Ethernet CRC */
       0  0.0%                  if (!(adapter->flags2 & FLAG2_CRC_STRIPPING))
       0  0.0%                          length -= 4;
     436  5.0%      f039:       41 f6 84 24 5c 29 00    testb  $0x1,0x295c(%r12)
       0  0.0%      f089:       8b 4d 84                mov    -0x7c(%rbp),%ecx
       0  0.0%      f08c:       48 83 ef 02             sub    $0x2,%rdi
       0  0.0%      f090:       48 83 ee 02             sub    $0x2,%rsi
     811  9.3%      f094:       f3 a4                   rep movsb %ds:(%rsi),%es:(%rdi)
       0  0.0%
       0  0.0%          while (rx_desc->status & E1000_RXD_STAT_DD) {
       0  0.0%      f114:       41 f6 47 0c 01          testb  $0x1,0xc(%r15)
    7226 82.6%      f119:       0f 85 24 fe ff ff       jne    ef43 <e1000_clean_rx_irq+0x84>

Available events:
        0 cache-misses
        1 cache-references
        2 instructions
        3 cycles
Enter details event counter: 2
------------------------------------------------------------------------------
   PerfTop:   15035 irqs/sec  kernel:79.0% [cache-misses/cache-references/instructions/cycles],  (all, 4 CPUs)
------------------------------------------------------------------------------

Showing instructions for e1000_clean_rx_irq
  Events  Pcnt (>=3%)
       0  0.0%                                 int *work_done, int work_to_do)
       0  0.0%  {
     175  0.9%      eebf:       55                      push   %rbp
    1898  9.8%      eec0:       48 89 e5                mov    %rsp,%rbp
       0  0.0%
       0  0.0%          i = rx_ring->next_to_clean;
     140  0.7%      ef0a:       0f b7 41 1a             movzwl 0x1a(%rcx),%eax
     670  3.4%      ef0e:       89 45 ac                mov    %eax,-0x54(%rbp)
       0  0.0%  {
       0  0.0%          memcpy(skb->data + offset, from, len);
      91  0.5%      f07b:       49 8b b6 e8 00 00 00    mov    0xe8(%r14),%rsi
    1153  5.9%      f082:       48 8b b8 e8 00 00 00    mov    0xe8(%rax),%rdi
      42  0.2%      f089:       8b 4d 84                mov    -0x7c(%rbp),%ecx
      14  0.1%      f08c:       48 83 ef 02             sub    $0x2,%rdi
       0  0.0%      f090:       48 83 ee 02             sub    $0x2,%rsi
    1618  8.3%      f094:       f3 a4                   rep movsb %ds:(%rsi),%es:(%rdi)
       0  0.0%
       0  0.0%                  /* return some buffers to hardware, one at a time is too slow */
       0  0.0%                  if (cleaned_count >= E1000_RX_BUFFER_WRITE) {
     867  4.5%      f0e7:       83 7d b0 0f             cmpl   $0xf,-0x50(%rbp)
       0  0.0%
       0  0.0%          while (rx_desc->status & E1000_RXD_STAT_DD) {
      37  0.2%      f114:       41 f6 47 0c 01          testb  $0x1,0xc(%r15)
    4047 20.8%      f119:       0f 85 24 fe ff ff       jne    ef43 <e1000_clean_rx_irq+0x84>

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index f139f1a..d587013 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -31,6 +31,8 @@
 #include <fcntl.h>
 
 #include <stdio.h>
+#include <termios.h>
+#include <unistd.h>
 
 #include <errno.h>
 #include <time.h>
@@ -54,7 +56,7 @@ static int			system_wide			=  0;
 
 static int			default_interval		= 100000;
 
-static u64			count_filter			=  5;
+static int			count_filter			=  5;
 static int			print_entries			= 15;
 
 static int			target_pid			= -1;
@@ -69,15 +71,27 @@ static int			freq				=  0;
 static int			verbose				=  0;
 static char			*vmlinux			=  NULL;
 
-static char			*sym_filter;
-static unsigned long		filter_start;
-static unsigned long		filter_end;
-
 static int			delay_secs			=  2;
 static int			zero;
 static int			dump_symtab;
 
 /*
+ * Source
+ */
+
+struct source_line {
+	u64			eip;
+	unsigned long		count[MAX_COUNTERS];
+	char			*line;
+	struct source_line	*next;
+};
+
+static char			*sym_filter			=  NULL;
+struct sym_entry		*sym_filter_entry		=  NULL;
+static int			sym_pcnt_filter			=  5;
+static int			sym_counter			=  0;
+
+/*
  * Symbols
  */
 
@@ -91,9 +105,237 @@ struct sym_entry {
 	unsigned long		snap_count;
 	double			weight;
 	int			skip;
+	struct source_line	*source;
+	struct source_line	*lines;
+	struct source_line	**lines_tail;
+	pthread_mutex_t		source_lock;
 };
 
-struct sym_entry		*sym_filter_entry;
+/*
+ * Source functions
+ */
+
+static void parse_source(struct sym_entry *syme)
+{
+	struct symbol *sym;
+	struct module *module;
+	struct section *section = NULL;
+	FILE *file;
+	char command[PATH_MAX*2], *path = vmlinux;
+	u64 start, end, len;
+
+	if (!syme)
+		return;
+
+	if (syme->lines) {
+		pthread_mutex_lock(&syme->source_lock);
+		goto out_assign;
+	}
+
+	sym = (struct symbol *)(syme + 1);
+	module = sym->module;
+
+	if (module)
+		path = module->path;
+	if (!path)
+		return;
+
+	start = sym->obj_start;
+	if (!start)
+		start = sym->start;
+
+	if (module) {
+		section = module->sections->find_section(module->sections, ".text");
+		if (section)
+			start -= section->vma;
+	}
+
+	end = start + sym->end - sym->start + 1;
+	len = sym->end - sym->start;
+
+	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", start, end, path);
+
+	file = popen(command, "r");
+	if (!file)
+		return;
+
+	pthread_mutex_lock(&syme->source_lock);
+	syme->lines_tail = &syme->lines;
+	while (!feof(file)) {
+		struct source_line *src;
+		size_t dummy = 0;
+		char *c;
+
+		src = malloc(sizeof(struct source_line));
+		assert(src != NULL);
+		memset(src, 0, sizeof(struct source_line));
+
+		if (getline(&src->line, &dummy, file) < 0)
+			break;
+		if (!src->line)
+			break;
+
+		c = strchr(src->line, '\n');
+		if (c)
+			*c = 0;
+
+		src->next = NULL;
+		*syme->lines_tail = src;
+		syme->lines_tail = &src->next;
+
+		if (strlen(src->line)>8 && src->line[8] == ':') {
+			src->eip = strtoull(src->line, NULL, 16);
+			if (section)
+				src->eip += section->vma;
+		}
+		if (strlen(src->line)>8 && src->line[16] == ':') {
+			src->eip = strtoull(src->line, NULL, 16);
+			if (section)
+				src->eip += section->vma;
+		}
+	}
+	pclose(file);
+out_assign:
+	sym_filter_entry = syme;
+	pthread_mutex_unlock(&syme->source_lock);
+}
+
+static void __zero_source_counters(struct sym_entry *syme)
+{
+	int i;
+	struct source_line *line;
+
+	line = syme->lines;
+	while (line) {
+		for (i = 0; i < nr_counters; i++)
+			line->count[i] = 0;
+		line = line->next;
+	}
+}
+
+static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip)
+{
+	struct source_line *line;
+
+	if (syme != sym_filter_entry)
+		return;
+
+	if (pthread_mutex_trylock(&syme->source_lock))
+		return;
+
+	if (!syme->source)
+		goto out_unlock;
+
+	for (line = syme->lines; line; line = line->next) {
+		if (line->eip == ip) {
+			line->count[counter]++;
+			break;
+		}
+		if (line->eip > ip)
+			break;
+	}
+out_unlock:
+	pthread_mutex_unlock(&syme->source_lock);
+}
+
+static void lookup_sym_source(struct sym_entry *syme)
+{
+	struct symbol *symbol = (struct symbol *)(syme + 1);
+	struct source_line *line;
+	char pattern[PATH_MAX];
+	char *idx;
+
+	sprintf(pattern, "<%s>:", symbol->name);
+
+	if (symbol->module) {
+		idx = strstr(pattern, "\t");
+		if (idx)
+			*idx = 0;
+	}
+
+	pthread_mutex_lock(&syme->source_lock);
+	for (line = syme->lines; line; line = line->next) {
+		if (strstr(line->line, pattern)) {
+			syme->source = line;
+			break;
+		}
+	}
+	pthread_mutex_unlock(&syme->source_lock);
+}
+
+static void show_lines(struct source_line *queue, int count, int total)
+{
+	int i;
+	struct source_line *line;
+
+	line = queue;
+	for (i = 0; i < count; i++) {
+		float pcnt = 100.0*(float)line->count[sym_counter]/(float)total;
+
+		printf("%8li %4.1f%%\t%s\n", line->count[sym_counter], pcnt, line->line);
+		line = line->next;
+	}
+}
+
+#define TRACE_COUNT     3
+
+static void show_details(struct sym_entry *syme)
+{
+	struct symbol *symbol;
+	struct source_line *line;
+	struct source_line *line_queue = NULL;
+	int displayed = 0;
+	int line_queue_count = 0, total = 0, more = 0;
+
+	if (!syme)
+		return;
+
+	if (!syme->source)
+		lookup_sym_source(syme);
+
+	if (!syme->source)
+		return;
+
+	symbol = (struct symbol *)(syme + 1);
+	printf("Showing %s for %s\n", event_name(sym_counter), symbol->name);
+	printf("  Events  Pcnt (>=%d%%)\n", sym_pcnt_filter);
+
+	pthread_mutex_lock(&syme->source_lock);
+	line = syme->source;
+	while (line) {
+		total += line->count[sym_counter];
+		line = line->next;
+	}
+
+	line = syme->source;
+	while (line) {
+		float pcnt = 0.0;
+
+		if (!line_queue_count)
+			line_queue = line;
+		line_queue_count++;
+
+		if (line->count[sym_counter])
+			pcnt = 100.0 * line->count[sym_counter] / (float)total;
+		if (pcnt >= (float)sym_pcnt_filter) {
+			if (displayed <= print_entries)
+				show_lines(line_queue, line_queue_count, total);
+			else more++;
+			displayed += line_queue_count;
+			line_queue_count = 0;
+			line_queue = NULL;
+		} else if (line_queue_count > TRACE_COUNT) {
+			line_queue = line_queue->next;
+			line_queue_count--;
+		}
+
+		line->count[sym_counter] = zero ? 0 : line->count[sym_counter] * 7 / 8;
+		line = line->next;
+	}
+	pthread_mutex_unlock(&syme->source_lock);
+	if (more)
+		printf("%d lines not displayed, maybe increase display entries [e]\n", more);
+}
 
 struct dso			*kernel_dso;
 
@@ -228,6 +470,11 @@ static void print_sym_table(void)
 
 	printf("------------------------------------------------------------------------------\n\n");
 
+	if (sym_filter_entry) {
+		show_details(sym_filter_entry);
+		return;
+	}
+
 	if (nr_counters == 1)
 		printf("             samples    pcnt");
 	else
@@ -242,7 +489,7 @@ static void print_sym_table(void)
 		struct symbol *sym = (struct symbol *)(syme + 1);
 		double pcnt;
 
-		if (++printed > print_entries || syme->snap_count < count_filter)
+		if (++printed > print_entries || (int)syme->snap_count < count_filter)
 			continue;
 
 		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
@@ -261,19 +508,208 @@ static void print_sym_table(void)
 	}
 }
 
+static void prompt_integer(int *target, const char *msg)
+{
+	char *buf = malloc(0), *p;
+	size_t dummy = 0;
+	int tmp;
+
+	fprintf(stdout, "\n%s: ", msg);
+	if (getline(&buf, &dummy, stdin) < 0)
+		return;
+
+	p = strchr(buf, '\n');
+	if (p)
+		*p = 0;
+
+	p = buf;
+	while(*p) {
+		if (!isdigit(*p))
+			goto out_free;
+		p++;
+	}
+	tmp = strtoul(buf, NULL, 10);
+	*target = tmp;
+out_free:
+	free(buf);
+}
+
+static void prompt_percent(int *target, const char *msg)
+{
+	int tmp = 0;
+
+	prompt_integer(&tmp, msg);
+	if (tmp >= 0 && tmp <= 100)
+		*target = tmp;
+}
+
+static void prompt_symbol(struct sym_entry **target, const char *msg)
+{
+	char *buf = malloc(0), *p;
+	struct sym_entry *syme = *target, *n, *found = NULL;
+	size_t dummy = 0;
+
+	/* zero counters of active symbol */
+	if (syme) {
+		pthread_mutex_lock(&syme->source_lock);
+		__zero_source_counters(syme);
+		*target = NULL;
+		pthread_mutex_unlock(&syme->source_lock);
+	}
+
+	fprintf(stdout, "\n%s: ", msg);
+	if (getline(&buf, &dummy, stdin) < 0)
+		goto out_free;
+
+	p = strchr(buf, '\n');
+	if (p)
+		*p = 0;
+
+	pthread_mutex_lock(&active_symbols_lock);
+	syme = list_entry(active_symbols.next, struct sym_entry, node);
+	pthread_mutex_unlock(&active_symbols_lock);
+
+	list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
+		struct symbol *sym = (struct symbol *)(syme + 1);
+
+		if (!strcmp(buf, sym->name)) {
+			found = syme;
+			break;
+		}
+	}
+
+	if (!found) {
+		fprintf(stderr, "Sorry, %s is not active.\n", sym_filter);
+		sleep(1);
+		return;
+	} else
+		parse_source(found);
+
+out_free:
+	free(buf);
+}
+
+static void print_known_keys(void)
+{
+	fprintf(stdout, "\nknown keys:\n");
+	fprintf(stdout, "\t[d]     select display delay.\n");
+	fprintf(stdout, "\t[e]     select display entries (lines).\n");
+	fprintf(stdout, "\t[E]     select annotation event counter.\n");
+	fprintf(stdout, "\t[f]     select normal display count filter.\n");
+	fprintf(stdout, "\t[F]     select annotation display count filter (percentage).\n");
+	fprintf(stdout, "\t[qQ]    quit.\n");
+	fprintf(stdout, "\t[s]     select annotation symbol and start annotation.\n");
+	fprintf(stdout, "\t[S]     stop annotation, revert to normal display.\n");
+	fprintf(stdout, "\t[z]     toggle event count zeroing.\n");
+}
+
+static void handle_keypress(int c)
+{
+	int once = 0;
+repeat:
+	switch (c) {
+		case 'd':
+			prompt_integer(&delay_secs, "Enter display delay");
+			break;
+		case 'e':
+			prompt_integer(&print_entries, "Enter display entries (lines)");
+			break;
+		case 'E':
+			if (nr_counters > 1) {
+				int i;
+
+				fprintf(stderr, "\nAvailable events:");
+				for (i = 0; i < nr_counters; i++)
+					fprintf(stderr, "\n\t%d %s", i, event_name(i));
+
+				prompt_integer(&sym_counter, "Enter details event counter");
+
+				if (sym_counter >= nr_counters) {
+					fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(0));
+					sym_counter = 0;
+					sleep(1);
+				}
+			} else sym_counter = 0;
+			break;
+		case 'f':
+			prompt_integer(&count_filter, "Enter display event count filter");
+			break;
+		case 'F':
+			prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)");
+			break;
+		case 'q':
+		case 'Q':
+			printf("exiting.\n");
+			exit(0);
+		case 's':
+			prompt_symbol(&sym_filter_entry, "Enter details symbol");
+			break;
+		case 'S':
+			if (!sym_filter_entry)
+				break;
+			else {
+				struct sym_entry *syme = sym_filter_entry;
+
+				pthread_mutex_lock(&syme->source_lock);
+				sym_filter_entry = NULL;
+				__zero_source_counters(syme);
+				pthread_mutex_unlock(&syme->source_lock);
+			}
+			break;
+		case 'z':
+			zero = ~zero;
+			break;
+		default: {
+			struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
+			struct termios tc, save;
+
+			if (!once) {
+				print_known_keys();
+				once++;
+			}
+
+			tcgetattr(0, &save);
+			tc = save;
+			tc.c_lflag &= ~(ICANON | ECHO);
+			tc.c_cc[VMIN] = 0;
+			tc.c_cc[VTIME] = 0;
+			tcsetattr(0, TCSANOW, &tc);
+
+			poll(&stdin_poll, 1, -1);
+			c = getc(stdin);
+
+			tcsetattr(0, TCSAFLUSH, &save);
+			goto repeat;
+		}
+	}
+}
+
 static void *display_thread(void *arg __used)
 {
 	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
-	int delay_msecs = delay_secs * 1000;
-
-	printf("PerfTop refresh period: %d seconds\n", delay_secs);
+	struct termios tc, save;
+	int delay_msecs, c;
+
+	tcgetattr(0, &save);
+	tc = save;
+	tc.c_lflag &= ~(ICANON | ECHO);
+	tc.c_cc[VMIN] = 0;
+	tc.c_cc[VTIME] = 0;
+repeat:
+	delay_msecs = delay_secs * 1000;
+	tcsetattr(0, TCSANOW, &tc);
+	/* trash return*/
+	getc(stdin);
 
 	do {
 		print_sym_table();
 	} while (!poll(&stdin_poll, 1, delay_msecs) == 1);
 
-	printf("key pressed - exiting.\n");
-	exit(0);
+	c = getc(stdin);
+	tcsetattr(0, TCSAFLUSH, &save);
+
+	handle_keypress(c);
+	goto repeat;
 
 	return NULL;
 }
@@ -293,7 +729,6 @@ static const char *skip_symbols[] = {
 
 static int symbol_filter(struct dso *self, struct symbol *sym)
 {
-	static int filter_match;
 	struct sym_entry *syme;
 	const char *name = sym->name;
 	int i;
@@ -315,6 +750,10 @@ static int symbol_filter(struct dso *self, struct symbol *sym)
 		return 1;
 
 	syme = dso__sym_priv(self, sym);
+	pthread_mutex_init(&syme->source_lock, NULL);
+	if (!sym_filter_entry && sym_filter && !strcmp(name, sym_filter))
+		sym_filter_entry = syme;
+
 	for (i = 0; skip_symbols[i]; i++) {
 		if (!strcmp(skip_symbols[i], name)) {
 			syme->skip = 1;
@@ -322,29 +761,6 @@ static int symbol_filter(struct dso *self, struct symbol *sym)
 		}
 	}
 
-	if (filter_match == 1) {
-		filter_end = sym->start;
-		filter_match = -1;
-		if (filter_end - filter_start > 10000) {
-			fprintf(stderr,
-				"hm, too large filter symbol <%s> - skipping.\n",
-				sym_filter);
-			fprintf(stderr, "symbol filter start: %016lx\n",
-				filter_start);
-			fprintf(stderr, "                end: %016lx\n",
-				filter_end);
-			filter_end = filter_start = 0;
-			sym_filter = NULL;
-			sleep(1);
-		}
-	}
-
-	if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) {
-		filter_match = 1;
-		filter_start = sym->start;
-	}
-
-
 	return 0;
 }
 
@@ -380,8 +796,6 @@ out_delete_dso:
 	return -1;
 }
 
-#define TRACE_COUNT     3
-
 /*
  * Binary search in the histogram table and record the hit:
  */
@@ -394,6 +808,7 @@ static void record_ip(u64 ip, int counter)
 
 		if (!syme->skip) {
 			syme->count[counter]++;
+			record_precise_ip(syme, counter, ip);
 			pthread_mutex_lock(&active_symbols_lock);
 			if (list_empty(&syme->node) || !syme->node.next)
 				__list_insert_active_sym(syme);
@@ -690,8 +1105,8 @@ static const struct option options[] = {
 			    "put the counters into a counter group"),
 	OPT_BOOLEAN('i', "inherit", &inherit,
 		    "child tasks inherit counters"),
-	OPT_STRING('s', "sym-filter", &sym_filter, "pattern",
-		    "only display symbols matchig this pattern"),
+	OPT_STRING('s', "sym-annotate", &sym_filter, "symbol name",
+		    "symbol to annotate - requires -k option"),
 	OPT_BOOLEAN('z', "zero", &zero,
 		    "zero history across updates"),
 	OPT_INTEGER('F', "freq", &freq,
@@ -734,6 +1149,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		delay_secs = 1;
 
 	parse_symbols();
+	parse_source(sym_filter_entry);
 
 	/*
 	 * Fill in the ones not specifically initialized via -c:
-- 
cgit v0.10.2


From 46ab976443c6c566c8fe6fc72a6733a55ba9fbea Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 24 Jul 2009 10:09:50 +0200
Subject: perf_counter tools: Allow perf top top users to switch between
 weighted and individual counter display

Add [w]eighted hotkey.  Pressing [w] toggles between displaying
weighted total of all counters, and the counter selected via
[E]vent select key.

------------------------------------------------------------------------------
   PerfTop:   90395 irqs/sec  kernel:16.1% [cache-misses/cache-references/instructions],  (all, 4 CPUs)
------------------------------------------------------------------------------

  weight     samples    pcnt         RIP          kernel function
  ______     _______   _____   ________________   _______________

1275408.6      10881 -  5.3% - ffffffff81146f70 : copy_page_c
 553683.4      43569 - 21.3% - ffffffff81146f20 : clear_page_c
  74075.0       6768 -  3.3% - ffffffff81147190 : copy_user_generic_string
  40602.9       7538 -  3.7% - ffffffff81284ba2 : _spin_lock
  26882.1        965 -  0.5% - ffffffff8109d280 : file_ra_state_init

[w]

------------------------------------------------------------------------------
   PerfTop:   91221 irqs/sec  kernel:14.5% [10000Hz cache-misses],  (all, 4 CPUs)
------------------------------------------------------------------------------

  weight     samples    pcnt         RIP          kernel function
  ______     _______   _____   ________________   _______________

            47320.00 - 22.3% - ffffffff81146f20 : clear_page_c
            14261.00 -  6.7% - ffffffff810992f5 : __rmqueue
            11046.00 -  5.2% - ffffffff81146f70 : copy_page_c
             7842.00 -  3.7% - ffffffff81284ba2 : _spin_lock
             7234.00 -  3.4% - ffffffff810aa1d6 : unmap_vmas

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index d587013..4eef346 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -90,6 +90,7 @@ static char			*sym_filter			=  NULL;
 struct sym_entry		*sym_filter_entry		=  NULL;
 static int			sym_pcnt_filter			=  5;
 static int			sym_counter			=  0;
+static int			display_weighted		= -1;
 
 /*
  * Symbols
@@ -354,6 +355,9 @@ static double sym_weight(const struct sym_entry *sym)
 	double weight = sym->snap_count;
 	int counter;
 
+	if (!display_weighted)
+		return weight;
+
 	for (counter = 1; counter < nr_counters-1; counter++)
 		weight *= sym->count[counter];
 
@@ -401,7 +405,7 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
 static void print_sym_table(void)
 {
 	int printed = 0, j;
-	int counter;
+	int counter, snap = !display_weighted ? sym_counter : 0;
 	float samples_per_sec = samples/delay_secs;
 	float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
 	float sum_ksamples = 0.0;
@@ -417,7 +421,7 @@ static void print_sym_table(void)
 	pthread_mutex_unlock(&active_symbols_lock);
 
 	list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
-		syme->snap_count = syme->count[0];
+		syme->snap_count = syme->count[snap];
 		if (syme->snap_count != 0) {
 			syme->weight = sym_weight(syme);
 			rb_insert_active_sym(&tmp, syme);
@@ -437,7 +441,7 @@ static void print_sym_table(void)
 		samples_per_sec,
 		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
 
-	if (nr_counters == 1) {
+	if (nr_counters == 1 || !display_weighted) {
 		printf("%Ld", (u64)attrs[0].sample_period);
 		if (freq)
 			printf("Hz ");
@@ -445,7 +449,9 @@ static void print_sym_table(void)
 			printf(" ");
 	}
 
-	for (counter = 0; counter < nr_counters; counter++) {
+	if (!display_weighted)
+		printf("%s", event_name(sym_counter));
+	else for (counter = 0; counter < nr_counters; counter++) {
 		if (counter)
 			printf("/");
 
@@ -495,7 +501,7 @@ static void print_sym_table(void)
 		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
 					 sum_ksamples));
 
-		if (nr_counters == 1)
+		if (nr_counters == 1 || !display_weighted)
 			printf("%20.2f - ", syme->weight);
 		else
 			printf("%9.1f %10ld - ", syme->weight, syme->snap_count);
@@ -594,13 +600,14 @@ static void print_known_keys(void)
 	fprintf(stdout, "\nknown keys:\n");
 	fprintf(stdout, "\t[d]     select display delay.\n");
 	fprintf(stdout, "\t[e]     select display entries (lines).\n");
-	fprintf(stdout, "\t[E]     select annotation event counter.\n");
+	fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(sym_counter));
 	fprintf(stdout, "\t[f]     select normal display count filter.\n");
 	fprintf(stdout, "\t[F]     select annotation display count filter (percentage).\n");
 	fprintf(stdout, "\t[qQ]    quit.\n");
 	fprintf(stdout, "\t[s]     select annotation symbol and start annotation.\n");
 	fprintf(stdout, "\t[S]     stop annotation, revert to normal display.\n");
-	fprintf(stdout, "\t[z]     toggle event count zeroing.\n");
+	fprintf(stdout, "\t[w]     toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0);
+	fprintf(stdout, "\t[z]     toggle sample zeroing.             \t(%d)\n", zero ? 1 : 0);
 }
 
 static void handle_keypress(int c)
@@ -656,6 +663,9 @@ repeat:
 				pthread_mutex_unlock(&syme->source_lock);
 			}
 			break;
+		case 'w':
+			display_weighted = ~display_weighted;
+			break;
 		case 'z':
 			zero = ~zero;
 			break;
-- 
cgit v0.10.2


From 7b4b6658e152ed4568cfff48175d93645df081d1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 22 Jul 2009 09:29:32 +0200
Subject: perf_counter: Fix software counters for fast moving event sources

Reimplement the software counters to deal with fast moving
event sources (such as tracepoints). This means being able
to generate multiple overflows from a single 'event' as well
as support throttling.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8681021..615440a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3344,87 +3344,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
  * Generic software counter infrastructure
  */
 
-static void perf_swcounter_update(struct perf_counter *counter)
+/*
+ * We directly increment counter->count and keep a second value in
+ * counter->hw.period_left to count intervals. This period counter
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+static u64 perf_swcounter_set_period(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
-	u64 prev, now;
-	s64 delta;
+	u64 period = hwc->last_period;
+	u64 nr, offset;
+	s64 old, val;
+
+	hwc->last_period = hwc->sample_period;
 
 again:
-	prev = atomic64_read(&hwc->prev_count);
-	now = atomic64_read(&hwc->count);
-	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
-		goto again;
+	old = val = atomic64_read(&hwc->period_left);
+	if (val < 0)
+		return 0;
 
-	delta = now - prev;
+	nr = div64_u64(period + val, period);
+	offset = nr * period;
+	val -= offset;
+	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+		goto again;
 
-	atomic64_add(delta, &counter->count);
-	atomic64_sub(delta, &hwc->period_left);
+	return nr;
 }
 
-static void perf_swcounter_set_period(struct perf_counter *counter)
+static void perf_swcounter_overflow(struct perf_counter *counter,
+				    int nmi, struct perf_sample_data *data)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
-	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->sample_period;
+	u64 overflow;
 
-	if (unlikely(left <= -period)) {
-		left = period;
-		atomic64_set(&hwc->period_left, left);
-		hwc->last_period = period;
-	}
+	data->period = counter->hw.last_period;
+	overflow = perf_swcounter_set_period(counter);
 
-	if (unlikely(left <= 0)) {
-		left += period;
-		atomic64_add(period, &hwc->period_left);
-		hwc->last_period = period;
-	}
+	if (hwc->interrupts == MAX_INTERRUPTS)
+		return;
 
-	atomic64_set(&hwc->prev_count, -left);
-	atomic64_set(&hwc->count, -left);
+	for (; overflow; overflow--) {
+		if (perf_counter_overflow(counter, nmi, data)) {
+			/*
+			 * We inhibit the overflow from happening when
+			 * hwc->interrupts == MAX_INTERRUPTS.
+			 */
+			break;
+		}
+	}
 }
 
-static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+static void perf_swcounter_unthrottle(struct perf_counter *counter)
 {
-	enum hrtimer_restart ret = HRTIMER_RESTART;
-	struct perf_sample_data data;
-	struct perf_counter *counter;
-	u64 period;
-
-	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
-	counter->pmu->read(counter);
-
-	data.addr = 0;
-	data.regs = get_irq_regs();
 	/*
-	 * In case we exclude kernel IPs or are somehow not in interrupt
-	 * context, provide the next best thing, the user IP.
+	 * Nothing to do, we already reset hwc->interrupts.
 	 */
-	if ((counter->attr.exclude_kernel || !data.regs) &&
-			!counter->attr.exclude_user)
-		data.regs = task_pt_regs(current);
+}
 
-	if (data.regs) {
-		if (perf_counter_overflow(counter, 0, &data))
-			ret = HRTIMER_NORESTART;
-	}
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+			       int nmi, struct perf_sample_data *data)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
 
-	period = max_t(u64, 10000, counter->hw.sample_period);
-	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+	atomic64_add(nr, &counter->count);
 
-	return ret;
-}
+	if (!hwc->sample_period)
+		return;
 
-static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct perf_sample_data *data)
-{
-	data->period = counter->hw.last_period;
+	if (!data->regs)
+		return;
 
-	perf_swcounter_update(counter);
-	perf_swcounter_set_period(counter);
-	if (perf_counter_overflow(counter, nmi, data))
-		/* soft-disable the counter */
-		;
+	if (!atomic64_add_negative(nr, &hwc->period_left))
+		perf_swcounter_overflow(counter, nmi, data);
 }
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3488,15 +3482,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	return 1;
 }
 
-static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct perf_sample_data *data)
-{
-	int neg = atomic64_add_negative(nr, &counter->hw.count);
-
-	if (counter->hw.sample_period && !neg && data->regs)
-		perf_swcounter_overflow(counter, nmi, data);
-}
-
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     enum perf_type_id type,
 				     u32 event, u64 nr, int nmi,
@@ -3575,27 +3560,66 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
 
 static void perf_swcounter_read(struct perf_counter *counter)
 {
-	perf_swcounter_update(counter);
 }
 
 static int perf_swcounter_enable(struct perf_counter *counter)
 {
-	perf_swcounter_set_period(counter);
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (hwc->sample_period) {
+		hwc->last_period = hwc->sample_period;
+		perf_swcounter_set_period(counter);
+	}
 	return 0;
 }
 
 static void perf_swcounter_disable(struct perf_counter *counter)
 {
-	perf_swcounter_update(counter);
 }
 
 static const struct pmu perf_ops_generic = {
 	.enable		= perf_swcounter_enable,
 	.disable	= perf_swcounter_disable,
 	.read		= perf_swcounter_read,
+	.unthrottle	= perf_swcounter_unthrottle,
 };
 
 /*
+ * hrtimer based swcounter callback
+ */
+
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
+	struct perf_counter *counter;
+	u64 period;
+
+	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
+	counter->pmu->read(counter);
+
+	data.addr = 0;
+	data.regs = get_irq_regs();
+	/*
+	 * In case we exclude kernel IPs or are somehow not in interrupt
+	 * context, provide the next best thing, the user IP.
+	 */
+	if ((counter->attr.exclude_kernel || !data.regs) &&
+			!counter->attr.exclude_user)
+		data.regs = task_pt_regs(current);
+
+	if (data.regs) {
+		if (perf_counter_overflow(counter, 0, &data))
+			ret = HRTIMER_NORESTART;
+	}
+
+	period = max_t(u64, 10000, counter->hw.sample_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+	return ret;
+}
+
+/*
  * Software counter: cpu wall time clock
  */
 
-- 
cgit v0.10.2


From 091bd2e993fcb1094a23e36157285b62bc87afdf Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 4 Aug 2009 10:21:23 +0200
Subject: perf top: Improve interactive key handling

Pressing any key which is not currently mapped to
functionality, based on startup command line options, displays
currently mapped keys, and prompts for input.

Pressing any unmapped key at the prompt returns the user to
display mode with variables unchanged.  eg, pressing ? <SPACE>
<ESC> etc displays currently available keys, the value of the
variable associated with that key, and prompts.

Pressing same again aborts input.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 4eef346..7de28ce 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -595,25 +595,84 @@ out_free:
 	free(buf);
 }
 
-static void print_known_keys(void)
+static void print_mapped_keys(void)
 {
-	fprintf(stdout, "\nknown keys:\n");
-	fprintf(stdout, "\t[d]     select display delay.\n");
-	fprintf(stdout, "\t[e]     select display entries (lines).\n");
-	fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(sym_counter));
-	fprintf(stdout, "\t[f]     select normal display count filter.\n");
-	fprintf(stdout, "\t[F]     select annotation display count filter (percentage).\n");
-	fprintf(stdout, "\t[qQ]    quit.\n");
-	fprintf(stdout, "\t[s]     select annotation symbol and start annotation.\n");
-	fprintf(stdout, "\t[S]     stop annotation, revert to normal display.\n");
-	fprintf(stdout, "\t[w]     toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0);
+	char *name = NULL;
+
+	if (sym_filter_entry) {
+		struct symbol *sym = (struct symbol *)(sym_filter_entry+1);
+		name = sym->name;
+	}
+
+	fprintf(stdout, "\nMapped keys:\n");
+	fprintf(stdout, "\t[d]     display refresh delay.             \t(%d)\n", delay_secs);
+	fprintf(stdout, "\t[e]     display entries (lines).           \t(%d)\n", print_entries);
+
+	if (nr_counters > 1)
+		fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(sym_counter));
+
+	fprintf(stdout, "\t[f]     profile display filter (count).    \t(%d)\n", count_filter);
+
+	if (vmlinux) {
+		fprintf(stdout, "\t[F]     annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter);
+		fprintf(stdout, "\t[s]     annotate symbol.                   \t(%s)\n", name?: "NULL");
+		fprintf(stdout, "\t[S]     stop annotation.\n");
+	}
+
+	if (nr_counters > 1)
+		fprintf(stdout, "\t[w]     toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0);
+
 	fprintf(stdout, "\t[z]     toggle sample zeroing.             \t(%d)\n", zero ? 1 : 0);
+	fprintf(stdout, "\t[qQ]    quit.\n");
+}
+
+static int key_mapped(int c)
+{
+	switch (c) {
+		case 'd':
+		case 'e':
+		case 'f':
+		case 'z':
+		case 'q':
+		case 'Q':
+			return 1;
+		case 'E':
+		case 'w':
+			return nr_counters > 1 ? 1 : 0;
+		case 'F':
+		case 's':
+		case 'S':
+			return vmlinux ? 1 : 0;
+	}
+
+	return 0;
 }
 
 static void handle_keypress(int c)
 {
-	int once = 0;
-repeat:
+	if (!key_mapped(c)) {
+		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
+		struct termios tc, save;
+
+		print_mapped_keys();
+		fprintf(stdout, "\nEnter selection, or unmapped key to continue: ");
+		fflush(stdout);
+
+		tcgetattr(0, &save);
+		tc = save;
+		tc.c_lflag &= ~(ICANON | ECHO);
+		tc.c_cc[VMIN] = 0;
+		tc.c_cc[VTIME] = 0;
+		tcsetattr(0, TCSANOW, &tc);
+
+		poll(&stdin_poll, 1, -1);
+		c = getc(stdin);
+
+		tcsetattr(0, TCSAFLUSH, &save);
+		if (!key_mapped(c))
+			return;
+	}
+
 	switch (c) {
 		case 'd':
 			prompt_integer(&delay_secs, "Enter display delay");
@@ -669,28 +728,6 @@ repeat:
 		case 'z':
 			zero = ~zero;
 			break;
-		default: {
-			struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
-			struct termios tc, save;
-
-			if (!once) {
-				print_known_keys();
-				once++;
-			}
-
-			tcgetattr(0, &save);
-			tc = save;
-			tc.c_lflag &= ~(ICANON | ECHO);
-			tc.c_cc[VMIN] = 0;
-			tc.c_cc[VTIME] = 0;
-			tcsetattr(0, TCSANOW, &tc);
-
-			poll(&stdin_poll, 1, -1);
-			c = getc(stdin);
-
-			tcsetattr(0, TCSAFLUSH, &save);
-			goto repeat;
-		}
 	}
 }
 
@@ -705,6 +742,7 @@ static void *display_thread(void *arg __used)
 	tc.c_lflag &= ~(ICANON | ECHO);
 	tc.c_cc[VMIN] = 0;
 	tc.c_cc[VTIME] = 0;
+
 repeat:
 	delay_msecs = delay_secs * 1000;
 	tcsetattr(0, TCSANOW, &tc);
-- 
cgit v0.10.2


From 836179834833272f89098c6d1e1b89e8e69797c2 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 4 Aug 2009 10:24:41 +0200
Subject: perf top: Update man page

perf_counter tools: update perf top manual page to reflect
current implementation.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 539d012..4a7d558 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -3,36 +3,122 @@ perf-top(1)
 
 NAME
 ----
-perf-top - Run a command and profile it
+perf-top - System profiling tool.
 
 SYNOPSIS
 --------
 [verse]
-'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf top' [-e <EVENT> | --event=EVENT] [<options>]
 
 DESCRIPTION
 -----------
-This command runs a command and gathers a performance counter profile
-from it.
+This command generates and displays a performance counter profile in realtime.
 
 
 OPTIONS
 -------
-<command>...::
-	Any command you can specify in a shell.
+-a::
+--all-cpus::
+        System-wide collection.  (default)
+
+-c <count>::
+--count=<count>::
+	Event period to sample.
+
+-C <cpu>::
+--CPU=<cpu>::
+	CPU to profile.
+
+-d <seconds>::
+--delay=<seconds>::
+	Number of seconds to delay between refreshes.
 
--e::
---event=::
+-e <event>::
+--event=<event>::
 	Select the PMU event. Selection can be a symbolic event name
 	(use 'perf list' to list all events) or a raw PMU
 	event (eventsel+umask) in the form of rNNN where NNN is a
-	 hexadecimal event descriptor.
+	hexadecimal event descriptor.
 
--a::
-        system-wide collection
+-E <entries>::
+--entries=<entries>::
+	Display this many functions.
+
+-f <count>::
+--count-filter=<count>::
+	Only display functions with more events than this.
+
+-F <freq>::
+--freq=<freq>::
+	Profile at this frequency.
+
+-i::
+--inherit::
+	Child tasks inherit counters, only makes sens with -p option.
+
+-k <path>::
+--vmlinux=<path>::
+	Path to vmlinux.  Required for annotation functionality.
+
+-m <pages>::
+--mmap-pages=<pages>::
+	Number of mmapped data pages.
+
+-p <pid>::
+--pid=<pid>::
+	Profile events on existing pid.
+
+-r <priority>::
+--realtime=<priority>::
+	Collect data with this RT SCHED_FIFO priority.
+
+-s <symbol>::
+--sym-annotate=<symbol>::
+        Annotate this symbol.  Requires -k option.
+
+-v::
+--verbose::
+	Be more verbose (show counter open errors, etc).
+
+-z::
+--zero::
+	Zero history across display updates.
+
+INTERACTIVE PROMPTING KEYS
+--------------------------
+
+[d]::
+	Display refresh delay.
+
+[e]::
+	Number of entries to display.
+
+[E]::
+	Event to display when multiple counters are active.
+
+[f]::
+	Profile display filter (>= hit count).
+
+[F]::
+	Annotation display filter (>= % of total).
+
+[s]::
+	Annotate symbol.
+
+[S]::
+	Stop annotation, return to full profile display.
+
+[w]::
+	Toggle between weighted sum and individual count[E]r profile.
+
+[z]::
+	Toggle event count zeroing across display updates.
+
+[qQ]::
+	Quit.
+
+Pressing any unmapped key displays a menu, and prompts for input.
 
--l::
-        scale counter values
 
 SEE ALSO
 --------
-- 
cgit v0.10.2


From 1953287bfe8afcbbd235bd6c42c9df06d52438dc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 7 Aug 2009 07:11:05 +0200
Subject: perf tools: Fix call-chain cumul hit based sub-total (fractal mode)

The callchain fractal mode builds each new total hits in a new
branch of profiling by using the parent's hits of the current
branch plus the hits of the children.

This is wrong, the total hits of a branch should be made of the
sum of every children hits, we must ignore the parent hits in
this scope.

This patch also fixes another mistake with the hit counting.

Now the rates are correct.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 8cb58d6..da402e1 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -901,7 +901,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 	int i;
 
 	if (callchain_param.mode == CHAIN_GRAPH_REL)
-		new_total = self->cumul_hit;
+		new_total = self->children_hit;
 	else
 		new_total = total_samples;
 
@@ -930,7 +930,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 			ret += ipchain__fprintf_graph(fp, chain, depth,
 						      new_depth_mask, i++,
 						      new_total,
-						      child->cumul_hit);
+						      cumul_hits(child));
 		}
 		ret += callchain__fprintf_graph(fp, child, new_total,
 						depth + 1,
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 9d3c814..98c5627 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -26,10 +26,14 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
 	struct callchain_node *rnode;
+	u64 chain_cumul = cumul_hits(chain);
 
 	while (*p) {
+		u64 rnode_cumul;
+
 		parent = *p;
 		rnode = rb_entry(parent, struct callchain_node, rb_node);
+		rnode_cumul = cumul_hits(rnode);
 
 		switch (mode) {
 		case CHAIN_FLAT:
@@ -40,7 +44,7 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
 			break;
 		case CHAIN_GRAPH_ABS: /* Falldown */
 		case CHAIN_GRAPH_REL:
-			if (rnode->cumul_hit < chain->cumul_hit)
+			if (rnode_cumul < chain_cumul)
 				p = &(*p)->rb_left;
 			else
 				p = &(*p)->rb_right;
@@ -87,7 +91,7 @@ static void __sort_chain_graph_abs(struct callchain_node *node,
 
 	chain_for_each_child(child, node) {
 		__sort_chain_graph_abs(child, min_hit);
-		if (child->cumul_hit >= min_hit)
+		if (cumul_hits(child) >= min_hit)
 			rb_insert_callchain(&node->rb_root, child,
 					    CHAIN_GRAPH_ABS);
 	}
@@ -108,11 +112,11 @@ static void __sort_chain_graph_rel(struct callchain_node *node,
 	u64 min_hit;
 
 	node->rb_root = RB_ROOT;
-	min_hit = node->cumul_hit * min_percent / 100.0;
+	min_hit = node->children_hit * min_percent / 100.0;
 
 	chain_for_each_child(child, node) {
 		__sort_chain_graph_rel(child, min_percent);
-		if (child->cumul_hit >= min_hit)
+		if (cumul_hits(child) >= min_hit)
 			rb_insert_callchain(&node->rb_root, child,
 					    CHAIN_GRAPH_REL);
 	}
@@ -211,7 +215,8 @@ add_child(struct callchain_node *parent, struct ip_callchain *chain,
 	new = create_child(parent, false);
 	fill_node(new, chain, start, syms);
 
-	new->cumul_hit = new->hit = 1;
+	new->children_hit = 0;
+	new->hit = 1;
 }
 
 /*
@@ -241,7 +246,8 @@ split_add_child(struct callchain_node *parent, struct ip_callchain *chain,
 
 	/* split the hits */
 	new->hit = parent->hit;
-	new->cumul_hit = parent->cumul_hit;
+	new->children_hit = parent->children_hit;
+	parent->children_hit = cumul_hits(new);
 	new->val_nr = parent->val_nr - idx_local;
 	parent->val_nr = idx_local;
 
@@ -249,6 +255,7 @@ split_add_child(struct callchain_node *parent, struct ip_callchain *chain,
 	if (idx_total < chain->nr) {
 		parent->hit = 0;
 		add_child(parent, chain, idx_total, syms);
+		parent->children_hit++;
 	} else {
 		parent->hit = 1;
 	}
@@ -269,13 +276,13 @@ __append_chain_children(struct callchain_node *root, struct ip_callchain *chain,
 		unsigned int ret = __append_chain(rnode, chain, start, syms);
 
 		if (!ret)
-			goto cumul;
+			goto inc_children_hit;
 	}
 	/* nothing in children, add to the current node */
 	add_child(root, chain, start, syms);
 
-cumul:
-	root->cumul_hit++;
+inc_children_hit:
+	root->children_hit++;
 }
 
 static int
@@ -317,8 +324,6 @@ __append_chain(struct callchain_node *root, struct ip_callchain *chain,
 	/* we match 100% of the path, increment the hit */
 	if (i - start == root->val_nr && i == chain->nr) {
 		root->hit++;
-		root->cumul_hit++;
-
 		return 0;
 	}
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 7812122..b2d128e 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -21,7 +21,7 @@ struct callchain_node {
 	struct rb_root		rb_root; /* sorted tree of children */
 	unsigned int		val_nr;
 	u64			hit;
-	u64			cumul_hit; /* hit + hits of children */
+	u64			children_hit;
 };
 
 struct callchain_param;
@@ -48,6 +48,11 @@ static inline void callchain_init(struct callchain_node *node)
 	INIT_LIST_HEAD(&node->val);
 }
 
+static inline u64 cumul_hits(struct callchain_node *node)
+{
+	return node->hit + node->children_hit;
+}
+
 int register_callchain_param(struct callchain_param *param);
 void append_chain(struct callchain_node *root, struct ip_callchain *chain,
 		  struct symbol **syms);
-- 
cgit v0.10.2


From 1c222bce7dd0cb9578b4c5cd3874a74f1db497c3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 6 Aug 2009 20:57:41 +0200
Subject: perf tools: Fix multi-counter stat bug caused by incorrect reading of
 perf.data file header

Brice Goglin reported that only the first result from a
multi-counter perf record --stat run is accurate, the
rest looks bogus.

A silly mistake made us re-read the first attribute for
every recorded attribute.

Reported-by: Brice Goglin <Brice.Goglin@inria.fr>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Brice Goglin <Brice.Goglin@inria.fr>
Cc: paulus@samba.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 450384b..95a44bc 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -213,9 +213,10 @@ struct perf_header *perf_header__read(int fd)
 
 	for (i = 0; i < nr_attrs; i++) {
 		struct perf_header_attr *attr;
-		off_t tmp = lseek(fd, 0, SEEK_CUR);
+		off_t tmp;
 
 		do_read(fd, &f_attr, sizeof(f_attr));
+		tmp = lseek(fd, 0, SEEK_CUR);
 
 		attr = perf_header_attr__new(&f_attr.attr);
 
-- 
cgit v0.10.2


From 8f18aec535b5ca513dd13b531730177d35175ffa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 6 Aug 2009 19:40:28 +0200
Subject: perf report: Fix per task mult-counter stat reporting

Brice Goglin reported:

> I can easily sort them by thread id, but I don't know how to match
> my 4 events with each group of 4 lines.

Also report the counter id and the time running/enabled
stats (in case the counter got time-shared).

Reported-by: Brice Goglin <Brice.Goglin@inria.fr>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Brice Goglin <Brice.Goglin@inria.fr>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index da402e1..8420546 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -112,7 +112,9 @@ struct read_event {
 	struct perf_event_header header;
 	u32 pid,tid;
 	u64 value;
-	u64 format[3];
+	u64 time_enabled;
+	u64 time_running;
+	u64 id;
 };
 
 typedef union event_union {
@@ -1690,14 +1692,37 @@ static void trace_event(event_t *event)
 	dprintf(".\n");
 }
 
+static struct perf_header	*header;
+
+static struct perf_counter_attr *perf_header__find_attr(u64 id)
+{
+	int i;
+
+	for (i = 0; i < header->attrs; i++) {
+		struct perf_header_attr *attr = header->attr[i];
+		int j;
+
+		for (j = 0; j < attr->ids; j++) {
+			if (attr->id[j] == id)
+				return &attr->attr;
+		}
+	}
+
+	return NULL;
+}
+
 static int
 process_read_event(event_t *event, unsigned long offset, unsigned long head)
 {
-	dprintf("%p [%p]: PERF_EVENT_READ: %d %d %Lu\n",
+	struct perf_counter_attr *attr = perf_header__find_attr(event->read.id);
+
+	dprintf("%p [%p]: PERF_EVENT_READ: %d %d %s %Lu\n",
 			(void *)(offset + head),
 			(void *)(long)(event->header.size),
 			event->read.pid,
 			event->read.tid,
+			attr ? __event_name(attr->type, attr->config)
+			     : "FAIL",
 			event->read.value);
 
 	return 0;
@@ -1743,8 +1768,6 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
-static struct perf_header	*header;
-
 static u64 perf_header__sample_type(void)
 {
 	u64 sample_type = 0;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 7bdad8d..f77407b 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -223,9 +223,15 @@ char *event_name(int counter)
 {
 	u64 config = attrs[counter].config;
 	int type = attrs[counter].type;
+
+	return __event_name(type, config);
+}
+
+char *__event_name(int type, u64 config)
+{
 	static char buf[32];
 
-	if (attrs[counter].type == PERF_TYPE_RAW) {
+	if (type == PERF_TYPE_RAW) {
 		sprintf(buf, "raw 0x%llx", config);
 		return buf;
 	}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 1ea5d09..192a962 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -10,6 +10,7 @@ extern int			nr_counters;
 extern struct perf_counter_attr attrs[MAX_COUNTERS];
 
 extern char *event_name(int ctr);
+extern char *__event_name(int type, u64 config);
 
 extern int parse_events(const struct option *opt, const char *str, int unset);
 
-- 
cgit v0.10.2


From 94cb9e385d5b4d55a5ae389baa10ad2835ea39bb Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 6 Aug 2009 14:43:17 -0300
Subject: perf report: Add debug help for the finding of symbol bugs - show the
 symtab origin (DSO, build-id, kernel, etc)

Used with perf report --verbose:

[acme@doppio linux-2.6-tip]$ perf report -v | head -16
     5.17%  firefox  /usr/lib64/xulrunner-1.9.1/libxul.so   0x00000000005d8eee f [.] imgContainer::DrawFrameTo(gfxIImageFrame*, gfxIImageFrame*, nsRect&)
     2.56%  firefox  /lib64/libpthread-2.10.1.so            0x0000000000008e02 d [.] __pthread_mutex_lock_internal
     1.94%  firefox  /usr/lib64/xulrunner-1.9.1/libxul.so   0x0000000000d0af8f f [.] SearchTable
     1.75%  firefox  [kernel]                               0xffffffffff60013b k [.] vread_hpet
     1.63%  firefox  /lib64/libpthread-2.10.1.so            0x000000000000a404 d [.] __pthread_mutex_unlock
     1.47%  firefox  /usr/lib64/xulrunner-1.9.1/libmozjs.so 0x00000000000482ea f [.] js_Interpret
     1.42%  firefox  /usr/lib64/xulrunner-1.9.1/libmozjs.so 0x000000000003eda3 f [.] JS_CallTracer
     1.24%  firefox  [kernel]                               0xffffffff8102ca4a k [k] read_hpet
     1.16%  firefox  [kernel]                               0xffffffff810f3dd4 k [k] fget_light
     1.11%  firefox  /usr/lib64/xulrunner-1.9.1/libmozjs.so 0x00000000000567ff f [.] js_TraceObject
     0.98%  firefox  /usr/lib64/firefox-3.5.2/firefox       0x000000000000dd23 b [.] arena_ralloc
[acme@doppio linux-2.6-tip]$

The new field is just after the symbol address. To help in
figuring out symbol resolution bugs.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 8420546..a5e2f8d 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -700,7 +700,8 @@ sort__sym_print(FILE *fp, struct hist_entry *self, unsigned int width __used)
 	size_t ret = 0;
 
 	if (verbose)
-		ret += repsep_fprintf(fp, "%#018llx  ", (u64)self->ip);
+		ret += repsep_fprintf(fp, "%#018llx %c ", (u64)self->ip,
+				      dso__symtab_origin(self->dso));
 
 	ret += repsep_fprintf(fp, "[%c] ", self->level);
 	if (self->sym) {
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 16ddca2..f1dcede 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -24,6 +24,16 @@ const char *sym_hist_filter;
 #define DMGL_ANSI        (1 << 1)       /* Include const, volatile, etc */
 #endif
 
+enum dso_origin {
+	DSO__ORIG_KERNEL = 0,
+	DSO__ORIG_JAVA_JIT,
+	DSO__ORIG_FEDORA,
+	DSO__ORIG_UBUNTU,
+	DSO__ORIG_BUILDID,
+	DSO__ORIG_DSO,
+	DSO__ORIG_NOT_FOUND,
+};
+
 static struct symbol *symbol__new(u64 start, u64 len,
 				  const char *name, unsigned int priv_size,
 				  u64 obj_start, int verbose)
@@ -81,6 +91,7 @@ struct dso *dso__new(const char *name, unsigned int sym_priv_size)
 		self->sym_priv_size = sym_priv_size;
 		self->find_symbol = dso__find_symbol;
 		self->slen_calculated = 0;
+		self->origin = DSO__ORIG_NOT_FOUND;
 	}
 
 	return self;
@@ -710,7 +721,7 @@ static char *dso__read_build_id(struct dso *self, int verbose)
 		++raw;
 		bid += 2;
 	}
-	if (verbose)
+	if (verbose >= 2)
 		printf("%s(%s): %s\n", __func__, self->name, build_id);
 out_elf_end:
 	elf_end(elf);
@@ -720,11 +731,26 @@ out:
 	return build_id;
 }
 
+char dso__symtab_origin(const struct dso *self)
+{
+	static const char origin[] = {
+		[DSO__ORIG_KERNEL] =   'k',
+		[DSO__ORIG_JAVA_JIT] = 'j',
+		[DSO__ORIG_FEDORA] =   'f',
+		[DSO__ORIG_UBUNTU] =   'u',
+		[DSO__ORIG_BUILDID] =  'b',
+		[DSO__ORIG_DSO] =      'd',
+	};
+
+	if (self == NULL || self->origin == DSO__ORIG_NOT_FOUND)
+		return '!';
+	return origin[self->origin];
+}
+
 int dso__load(struct dso *self, symbol_filter_t filter, int verbose)
 {
 	int size = PATH_MAX;
 	char *name = malloc(size), *build_id = NULL;
-	int variant = 0;
 	int ret = -1;
 	int fd;
 
@@ -733,19 +759,26 @@ int dso__load(struct dso *self, symbol_filter_t filter, int verbose)
 
 	self->adjust_symbols = 0;
 
-	if (strncmp(self->name, "/tmp/perf-", 10) == 0)
-		return dso__load_perf_map(self, filter, verbose);
+	if (strncmp(self->name, "/tmp/perf-", 10) == 0) {
+		ret = dso__load_perf_map(self, filter, verbose);
+		self->origin = ret > 0 ? DSO__ORIG_JAVA_JIT :
+					 DSO__ORIG_NOT_FOUND;
+		return ret;
+	}
+
+	self->origin = DSO__ORIG_FEDORA - 1;
 
 more:
 	do {
-		switch (variant) {
-		case 0: /* Fedora */
+		self->origin++;
+		switch (self->origin) {
+		case DSO__ORIG_FEDORA:
 			snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
 			break;
-		case 1: /* Ubuntu */
+		case DSO__ORIG_UBUNTU:
 			snprintf(name, size, "/usr/lib/debug%s", self->name);
 			break;
-		case 2:
+		case DSO__ORIG_BUILDID:
 			build_id = dso__read_build_id(self, verbose);
 			if (build_id != NULL) {
 				snprintf(name, size,
@@ -754,16 +787,15 @@ more:
 				free(build_id);
 				break;
 			}
-			variant++;
+			self->origin++;
 			/* Fall thru */
-		case 3: /* Sane people */
+		case DSO__ORIG_DSO:
 			snprintf(name, size, "%s", self->name);
 			break;
 
 		default:
 			goto out;
 		}
-		variant++;
 
 		fd = open(name, O_RDONLY);
 	} while (fd < 0);
@@ -899,6 +931,9 @@ int dso__load_kernel(struct dso *self, const char *vmlinux,
 	if (err <= 0)
 		err = dso__load_kallsyms(self, filter, verbose);
 
+	if (err > 0)
+		self->origin = DSO__ORIG_KERNEL;
+
 	return err;
 }
 
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 2f92b21..1e003ec 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -26,6 +26,7 @@ struct dso {
 	unsigned int	 sym_priv_size;
 	unsigned char	 adjust_symbols;
 	unsigned char	 slen_calculated;
+	unsigned char	 origin;
 	char		 name[0];
 };
 
@@ -49,6 +50,7 @@ int dso__load_modules(struct dso *self, symbol_filter_t filter, int verbose);
 int dso__load(struct dso *self, symbol_filter_t filter, int verbose);
 
 size_t dso__fprintf(struct dso *self, FILE *fp);
+char dso__symtab_origin(const struct dso *self);
 
 void symbol__init(void);
 #endif /* _PERF_SYMBOL_ */
-- 
cgit v0.10.2


From b26bc5a7f81474937e427b0c855eabee5ad56f89 Mon Sep 17 00:00:00 2001
From: Brice Goglin <Brice.Goglin@inria.fr>
Date: Fri, 7 Aug 2009 10:18:39 +0200
Subject: perf stat: Fix tool option consistency: rename -S/--scale to
 -c/--scale

We want to use a coherent flag for -S/--stat across all tools,
so free up -S in perf stat.

Signed-off-by: Brice Goglin <Brice.Goglin@inria.fr>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus@samba.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 0d74346..484080d 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -40,7 +40,7 @@ OPTIONS
 -a::
         system-wide collection
 
--S::
+-c::
         scale counter values
 
 EXAMPLES
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f9510ee..b4b06c7 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -496,7 +496,7 @@ static const struct option options[] = {
 		    "stat events on existing pid"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 		    "system-wide collection from all CPUs"),
-	OPT_BOOLEAN('S', "scale", &scale,
+	OPT_BOOLEAN('c', "scale", &scale,
 		    "scale/normalize counters"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
-- 
cgit v0.10.2


From f36a1a133a947973efb8e6a1fbdcc23e4a011437 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 7 Aug 2009 16:59:45 +1000
Subject: perf_counter/powerpc: Fix oops on cpus without perf_counter hardware
 support

If we have the powerpc perf_counter backend compiled in, but
the cpu we are running on is one where we don't support the
PMU, we currently oops in hw_perf_group_sched_in if we try to
use any counters, because ppmu is NULL in that case, and we
unconditionally dereference ppmu.

This fixes the problem by adding a check if ppmu is NULL at the
beginning of hw_perf_group_sched_in, and also at the beginning
of the other functions that get called from the perf_counter
core, i.e. hw_perf_disable, hw_perf_enable, and
hw_perf_counter_setup.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: benh@kernel.crashing.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 809fdf9..70e1f57 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -518,6 +518,8 @@ void hw_perf_disable(void)
 	struct cpu_hw_counters *cpuhw;
 	unsigned long flags;
 
+	if (!ppmu)
+		return;
 	local_irq_save(flags);
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 
@@ -572,6 +574,8 @@ void hw_perf_enable(void)
 	int n_lim;
 	int idx;
 
+	if (!ppmu)
+		return;
 	local_irq_save(flags);
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	if (!cpuhw->disabled) {
@@ -737,6 +741,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	long i, n, n0;
 	struct perf_counter *sub;
 
+	if (!ppmu)
+		return 0;
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	n0 = cpuhw->n_counters;
 	n = collect_events(group_leader, ppmu->n_counter - n0,
@@ -1281,6 +1287,8 @@ void hw_perf_counter_setup(int cpu)
 {
 	struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
 
+	if (!ppmu)
+		return;
 	memset(cpuhw, 0, sizeof(*cpuhw));
 	cpuhw->mmcr[0] = MMCR0_FC;
 }
-- 
cgit v0.10.2


From ae07b63f4b6728e1f98aa5c5416cfc1280f59f51 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 6 Aug 2009 16:48:54 +0200
Subject: perf list: Fix the output to not include tracepoints without an id

Stop perf list from displaying tracepoints without an id file,
those are special tracepoints that are not interfaced to
perfcounters so listing them is erroneous and passing them as
events will produce no output.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index f77407b..4858d83 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -121,13 +121,29 @@ static unsigned long hw_cache_stat[C(MAX)] = {
 	   (strcmp(sys_dirent.d_name, ".")) &&				       \
 	   (strcmp(sys_dirent.d_name, "..")))
 
+static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
+{
+	char evt_path[MAXPATHLEN];
+	int fd;
+
+	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
+			sys_dir->d_name, evt_dir->d_name);
+	fd = open(evt_path, O_RDONLY);
+	if (fd < 0)
+		return -EINVAL;
+	close(fd);
+
+	return 0;
+}
+
 #define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next, file, st)    \
 	while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next)        \
 	if (snprintf(file, MAXPATHLEN, "%s/%s/%s", debugfs_path,	       \
 		     sys_dirent.d_name, evt_dirent.d_name) &&		       \
 	   (!stat(file, &st)) && (S_ISDIR(st.st_mode)) &&		       \
 	   (strcmp(evt_dirent.d_name, ".")) &&				       \
-	   (strcmp(evt_dirent.d_name, "..")))
+	   (strcmp(evt_dirent.d_name, "..")) &&				       \
+	   (!tp_event_has_id(&sys_dirent, &evt_dirent)))
 
 #define MAX_EVENT_LENGTH 30
 
-- 
cgit v0.10.2


From 7eac7e9e726c1b136bd7e0ad6671ce315f48bb18 Mon Sep 17 00:00:00 2001
From: Pierre Habouzit <pierre.habouzit@intersec.com>
Date: Fri, 7 Aug 2009 14:16:00 +0200
Subject: perf util: Fix do_read() to fail on EOF instead of busy-looping

While toying with perf, I've noticed that perf record can
easily enter a busy loop when doing something as silly as:

    $ perf record -A ls

Yeah, do_read here really wants to read a known size, not being
able to should die(), not busy-loop ;)

That was the cause for the bug.

Signed-off-by: Pierre Habouzit <pierre.habouzit@intersec.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 95a44bc..b92a457 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -185,6 +185,8 @@ static void do_read(int fd, void *buf, size_t size)
 
 		if (ret < 0)
 			die("failed to read");
+		if (ret == 0)
+			die("failed to read: missing data");
 
 		size -= ret;
 		buf += ret;
-- 
cgit v0.10.2


From 266e0e219888420a1a7cafc82e82891cf7b5a979 Mon Sep 17 00:00:00 2001
From: Pierre Habouzit <pierre.habouzit@intersec.com>
Date: Fri, 7 Aug 2009 14:16:01 +0200
Subject: perf record: Fix the -A UI for empty or non-existent perf.data

1. Ignore the -A argument if there is no perf.data file
2. Treat an empty file like a non existent file.

Else, perf will try to read the perf.data header, and fail with
an error.

Treating an empty file like a non-existent file makes sense,
since an interupted (as in SIGKILLed) perf could leave such
files around, and you don't want to annoy the user with errors
for files with no data in it.

Signed-off-by: Pierre Habouzit <pierre.habouzit@intersec.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 90c9808..0345aad 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -525,10 +525,14 @@ static int __cmd_record(int argc, const char **argv)
 	signal(SIGCHLD, sig_handler);
 	signal(SIGINT, sig_handler);
 
-	if (!stat(output_name, &st) && !force && !append_file) {
-		fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
-				output_name);
-		exit(-1);
+	if (!stat(output_name, &st) && st.st_size) {
+		if (!force && !append_file) {
+			fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
+					output_name);
+			exit(-1);
+		}
+	} else {
+		append_file = 0;
 	}
 
 	flags = O_CREAT|O_RDWR;
-- 
cgit v0.10.2


From b0efe213f84f7fd5ccfe07053e3d9fb827b7c188 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 8 Aug 2009 02:16:23 +0200
Subject: perf tools: callchain: Fix spurious 'perf report' warnings: ignore
 empty callchains

When the callchain tree comes to insert an empty backtrace, it
raises a spurious warning about the fact we are inserting an
empty. This is spurious because the radix tree assumes it did
something wrong to reach this state. But it didn't, we just met
an empty callchain that has to be ignored.

This happens occasionally with certain types of call-chain
recordings. If it happens it's a big nuisance as perf report
output starts with thousands of warning lines.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1249690585-9145-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 98c5627..a8e67aa 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -336,5 +336,7 @@ __append_chain(struct callchain_node *root, struct ip_callchain *chain,
 void append_chain(struct callchain_node *root, struct ip_callchain *chain,
 		  struct symbol **syms)
 {
+	if (!chain->nr)
+		return;
 	__append_chain_children(root, chain, syms, 0);
 }
-- 
cgit v0.10.2


From b1a88349c37624755b28ac3b3152b48f52c1f487 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 8 Aug 2009 02:16:24 +0200
Subject: perf tools: callchain: Fix 'perf report' display to be callchain by
 default

If we recorded with -g option to record the callchain, right now
we require a -g option to perf report as well - and people reported
this as unnecessary complication: the user already specified -g
once, no need to require it a second time.

So if the recording includes call-chains, display the callchain by
default from perf report.

( The user can override this default using "-g none" option from
  perf report. )

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1249690585-9145-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index a5e2f8d..c4a8e10 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -68,7 +68,7 @@ static int		callchain;
 
 static
 struct callchain_param	callchain_param = {
-	.mode	= CHAIN_GRAPH_ABS,
+	.mode	= CHAIN_GRAPH_REL,
 	.min_percent = 0.5
 };
 
@@ -1836,6 +1836,13 @@ static int __cmd_report(void)
 					" -g?\n");
 			exit(-1);
 		}
+	} else if (callchain_param.mode != CHAIN_NONE && !callchain) {
+			callchain = 1;
+			if (register_callchain_param(&callchain_param) < 0) {
+				fprintf(stderr, "Can't register callchain"
+						" params\n");
+				exit(-1);
+			}
 	}
 
 	if (load_kernel() < 0) {
@@ -1974,6 +1981,13 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
 	else if (!strncmp(tok, "fractal", strlen(arg)))
 		callchain_param.mode = CHAIN_GRAPH_REL;
 
+	else if (!strncmp(tok, "none", strlen(arg))) {
+		callchain_param.mode = CHAIN_NONE;
+		callchain = 0;
+
+		return 0;
+	}
+
 	else
 		return -1;
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index b2d128e..a926ae4 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -7,6 +7,7 @@
 #include "symbol.h"
 
 enum chain_mode {
+	CHAIN_NONE,
 	CHAIN_FLAT,
 	CHAIN_GRAPH_ABS,
 	CHAIN_GRAPH_REL
-- 
cgit v0.10.2


From 25446036cbfc2c89faacdb4fb4603943d2197dc6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 8 Aug 2009 02:16:25 +0200
Subject: perf tools: callchain: Fix sum of percentages to be 100% by
 displaying amount of ignored chains in fractal mode

When we filter the callchains below a given percentage, we
ignore them and the end result only shows entries that have an
upper percentage than the filter threshold.

It seems to users then that we have an imbalance in the
percentage, as if the sum inside a profiled branch doesn't
reach 100%.

Since in the past there have been real perf report bugs that
showed the same sypmtom, it would be nice to assure the user
that the data is perfect and trustable and it all sums up to
100.00%.

So fix this by displaying the remaining hits that have been
filtered but without more detail than their amount in each
branches. Example while filtering below 50%:

7.73%  [k] delay_tsc
                |
                |--98.22%-- __const_udelay
                |          |
                |          |--86.37%-- ath5k_hw_register_timeout
                |          |          ath5k_hw_noise_floor_calibration
                |          |          ath5k_hw_reset
                |          |          ath5k_reset
                |          |          ath5k_config
                |          |          ieee80211_hw_config
                |          |          |
                |          |          |--88.53%-- ieee80211_scan_work
                |          |          |          worker_thread
                |          |          |          kthread
                |          |          |          child_rip
                |          |           --11.47%-- [...]
                |           --13.63%-- [...]
                 --1.78%-- [...]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1249690585-9145-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index c4a8e10..99274ce 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -891,6 +891,21 @@ ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain, int depth,
 	return ret;
 }
 
+static struct symbol *rem_sq_bracket;
+static struct callchain_list rem_hits;
+
+static void init_rem_hits(void)
+{
+	rem_sq_bracket = malloc(sizeof(*rem_sq_bracket) + 6);
+	if (!rem_sq_bracket) {
+		fprintf(stderr, "Not enough memory to display remaining hits\n");
+		return;
+	}
+
+	strcpy(rem_sq_bracket->name, "[...]");
+	rem_hits.sym = rem_sq_bracket;
+}
+
 static size_t
 callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 			u64 total_samples, int depth, int depth_mask)
@@ -900,6 +915,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 	struct callchain_list *chain;
 	int new_depth_mask = depth_mask;
 	u64 new_total;
+	u64 remaining;
 	size_t ret = 0;
 	int i;
 
@@ -908,17 +924,25 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 	else
 		new_total = total_samples;
 
+	remaining = new_total;
+
 	node = rb_first(&self->rb_root);
 	while (node) {
+		u64 cumul;
+
 		child = rb_entry(node, struct callchain_node, rb_node);
+		cumul = cumul_hits(child);
+		remaining -= cumul;
 
 		/*
 		 * The depth mask manages the output of pipes that show
 		 * the depth. We don't want to keep the pipes of the current
-		 * level for the last child of this depth
+		 * level for the last child of this depth.
+		 * Except if we have remaining filtered hits. They will
+		 * supersede the last child
 		 */
 		next = rb_next(node);
-		if (!next)
+		if (!next && (callchain_param.mode != CHAIN_GRAPH_REL || !remaining))
 			new_depth_mask &= ~(1 << (depth - 1));
 
 		/*
@@ -933,7 +957,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 			ret += ipchain__fprintf_graph(fp, chain, depth,
 						      new_depth_mask, i++,
 						      new_total,
-						      cumul_hits(child));
+						      cumul);
 		}
 		ret += callchain__fprintf_graph(fp, child, new_total,
 						depth + 1,
@@ -941,6 +965,19 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 		node = next;
 	}
 
+	if (callchain_param.mode == CHAIN_GRAPH_REL &&
+		remaining && remaining != new_total) {
+
+		if (!rem_sq_bracket)
+			return ret;
+
+		new_depth_mask &= ~(1 << (depth - 1));
+
+		ret += ipchain__fprintf_graph(fp, &rem_hits, depth,
+					      new_depth_mask, 0, new_total,
+					      remaining);
+	}
+
 	return ret;
 }
 
@@ -1361,6 +1398,8 @@ static size_t output__fprintf(FILE *fp, u64 total_samples)
 	unsigned int width;
 	char *col_width = col_width_list_str;
 
+	init_rem_hits();
+
 	fprintf(fp, "# Samples: %Ld\n", (u64)total_samples);
 	fprintf(fp, "#\n");
 
@@ -1432,6 +1471,8 @@ print_entries:
 	}
 	fprintf(fp, "\n");
 
+	free(rem_sq_bracket);
+
 	return ret;
 }
 
-- 
cgit v0.10.2


From 10b8e3066066708f304e0fc5cfe658e05abf943d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 8 Aug 2009 04:26:35 +0200
Subject: perf_counter: Work around gcc warning by initializing tracepoint
 record unconditionally
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Despite that the tracepoint record is always present when the
PERF_SAMPLE_TP_RECORD flag is set, gcc raises a warning,
thinking it might not be initialized:

  kernel/perf_counter.c: In function ‘perf_counter_output’:
  kernel/perf_counter.c:2650: warning: ‘tp’ may be used uninitialized in this function

Then, initialize it to NULL and always check if it's not NULL
before dereference it.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1249698400-5441-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 615440a..117622c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
-	struct perf_tracepoint_record *tp;
+	struct perf_tracepoint_record *tp = NULL;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2717,7 +2717,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 
 	if (sample_type & PERF_SAMPLE_TP_RECORD) {
 		tp = data->private;
-		header.size += tp->size;
+		if (tp)
+			header.size += tp->size;
 	}
 
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2783,7 +2784,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
-	if (sample_type & PERF_SAMPLE_TP_RECORD)
+	if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp)
 		perf_output_copy(&handle, tp->record, tp->size);
 
 	perf_output_end(&handle);
-- 
cgit v0.10.2


From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 8 Aug 2009 04:26:37 +0200
Subject: perf_counter: Fix tracepoint sampling to be part of generic sampling

Based on Peter's comments, make tracepoint sampling generic
just like all the other sampling bits are. This is a rename
with no code changes:

- PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW
- struct perf_tracepoint_record to perf_raw_record

We want the system in place that transport tracepoints raw
samples events into the perf ring buffer to be generalized and
usable by any type of counter.

Reported-by; Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a67dd5c..2aabe43 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -121,7 +121,7 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_CPU				= 1U << 7,
 	PERF_SAMPLE_PERIOD			= 1U << 8,
 	PERF_SAMPLE_STREAM_ID			= 1U << 9,
-	PERF_SAMPLE_TP_RECORD			= 1U << 10,
+	PERF_SAMPLE_RAW				= 1U << 10,
 
 	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
 };
@@ -414,9 +414,9 @@ struct perf_callchain_entry {
 	__u64				ip[PERF_MAX_STACK_DEPTH];
 };
 
-struct perf_tracepoint_record {
-	int				size;
-	char				*record;
+struct perf_raw_record {
+	u32				size;
+	void				*data;
 };
 
 struct task_struct;
@@ -687,7 +687,7 @@ struct perf_sample_data {
 	struct pt_regs			*regs;
 	u64				addr;
 	u64				period;
-	void				*private;
+	struct perf_raw_record		*raw;
 };
 
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 117622c..0023105 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
-	struct perf_tracepoint_record *tp = NULL;
+	struct perf_raw_record *raw = NULL;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2715,10 +2715,10 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 			header.size += sizeof(u64);
 	}
 
-	if (sample_type & PERF_SAMPLE_TP_RECORD) {
-		tp = data->private;
-		if (tp)
-			header.size += tp->size;
+	if (sample_type & PERF_SAMPLE_RAW) {
+		raw = data->raw;
+		if (raw)
+			header.size += raw->size;
 	}
 
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2784,8 +2784,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
-	if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp)
-		perf_output_copy(&handle, tp->record, tp->size);
+	if ((sample_type & PERF_SAMPLE_RAW) && raw)
+		perf_output_copy(&handle, raw->data, raw->size);
 
 	perf_output_end(&handle);
 }
@@ -3740,15 +3740,15 @@ static const struct pmu perf_ops_task_clock = {
 void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
 			  int entry_size)
 {
-	struct perf_tracepoint_record tp = {
+	struct perf_raw_record raw = {
 		.size = entry_size,
-		.record = record,
+		.data = record,
 	};
 
 	struct perf_sample_data data = {
 		.regs = get_irq_regs(),
 		.addr = addr,
-		.private = &tp,
+		.raw = &raw,
 	};
 
 	if (!data.regs)
-- 
cgit v0.10.2


From 3a80b4a3539696f4b0574876326860323035a302 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 7 Aug 2009 19:49:01 +0200
Subject: perf_counter: Fix a race on perf_counter_ctx

While extending perfcounters with BTS hw-tracing, Markus
Metzger managed to trigger this warning:

   [  995.557128] WARNING: at kernel/perf_counter.c:1191 __perf_counter_task_sched_out+0x48/0x6b()

triggers because commit
9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 (perf_counter: Full
task tracing) removed clearing of tsk->perf_counter_ctxp out
from under ctx->lock which introduced a race (against
perf_lock_task_context).

Move it back and deal with the exit notification by explicitly
passing along the former task context.

Reported-by: Markus T Metzger <markus.t.metzger@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1249667341.17467.5.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0023105..546e62d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2850,7 +2850,8 @@ perf_counter_read_event(struct perf_counter *counter,
  */
 
 struct perf_task_event {
-	struct task_struct	*task;
+	struct task_struct		*task;
+	struct perf_counter_context	*task_ctx;
 
 	struct {
 		struct perf_event_header	header;
@@ -2910,24 +2911,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx,
 static void perf_counter_task_event(struct perf_task_event *task_event)
 {
 	struct perf_cpu_context *cpuctx;
-	struct perf_counter_context *ctx;
+	struct perf_counter_context *ctx = task_event->task_ctx;
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_task_ctx(&cpuctx->ctx, task_event);
 	put_cpu_var(perf_cpu_context);
 
 	rcu_read_lock();
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (!ctx)
+		ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
 	if (ctx)
 		perf_counter_task_ctx(ctx, task_event);
 	rcu_read_unlock();
 }
 
-static void perf_counter_task(struct task_struct *task, int new)
+static void perf_counter_task(struct task_struct *task,
+			      struct perf_counter_context *task_ctx,
+			      int new)
 {
 	struct perf_task_event task_event;
 
@@ -2937,8 +2937,9 @@ static void perf_counter_task(struct task_struct *task, int new)
 		return;
 
 	task_event = (struct perf_task_event){
-		.task	= task,
-		.event  = {
+		.task	  = task,
+		.task_ctx = task_ctx,
+		.event    = {
 			.header = {
 				.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
 				.misc = 0,
@@ -2956,7 +2957,7 @@ static void perf_counter_task(struct task_struct *task, int new)
 
 void perf_counter_fork(struct task_struct *task)
 {
-	perf_counter_task(task, 1);
+	perf_counter_task(task, NULL, 1);
 }
 
 /*
@@ -4310,7 +4311,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	unsigned long flags;
 
 	if (likely(!child->perf_counter_ctxp)) {
-		perf_counter_task(child, 0);
+		perf_counter_task(child, NULL, 0);
 		return;
 	}
 
@@ -4330,6 +4331,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	 * incremented the context's refcount before we do put_ctx below.
 	 */
 	spin_lock(&child_ctx->lock);
+	child->perf_counter_ctxp = NULL;
 	/*
 	 * If this context is a clone; unclone it so it can't get
 	 * swapped to another process while we're removing all
@@ -4343,9 +4345,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	 * won't get any samples after PERF_EVENT_EXIT. We can however still
 	 * get a few PERF_EVENT_READ events.
 	 */
-	perf_counter_task(child, 0);
-
-	child->perf_counter_ctxp = NULL;
+	perf_counter_task(child, child_ctx, 0);
 
 	/*
 	 * We can recurse on the same lock type through:
-- 
cgit v0.10.2


From c24b513337f06712b8c81eb4f1413d44591dfee7 Mon Sep 17 00:00:00 2001
From: "Carlos R. Mafra" <crmafra2@gmail.com>
Date: Wed, 5 Aug 2009 20:53:34 +0200
Subject: perf: "Longum est iter per praecepta, breve et efficax per exempla"

A few examples of how 'perf' can be used, from an e-mail by
Ingo Molnar http://lkml.org/lkml/2009/8/4/346.

Signed-off-by: Carlos R. Mafra <crmafra2@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Valdis.Kletnieks@vt.edu
LKML-Reference: <20090805185334.GA4535@Pilar.aei.mpg.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/Documentation/perf-examples.txt b/tools/perf/Documentation/perf-examples.txt
new file mode 100644
index 0000000..8eb6c48
--- /dev/null
+++ b/tools/perf/Documentation/perf-examples.txt
@@ -0,0 +1,225 @@
+
+		------------------------------
+		****** perf by examples ******
+		------------------------------
+
+[ From an e-mail by Ingo Molnar, http://lkml.org/lkml/2009/8/4/346 ]
+
+
+First, discovery/enumeration of available counters can be done via
+'perf list':
+
+titan:~> perf list
+  [...]
+  kmem:kmalloc                             [Tracepoint event]
+  kmem:kmem_cache_alloc                    [Tracepoint event]
+  kmem:kmalloc_node                        [Tracepoint event]
+  kmem:kmem_cache_alloc_node               [Tracepoint event]
+  kmem:kfree                               [Tracepoint event]
+  kmem:kmem_cache_free                     [Tracepoint event]
+  kmem:mm_page_free_direct                 [Tracepoint event]
+  kmem:mm_pagevec_free                     [Tracepoint event]
+  kmem:mm_page_alloc                       [Tracepoint event]
+  kmem:mm_page_alloc_zone_locked           [Tracepoint event]
+  kmem:mm_page_pcpu_drain                  [Tracepoint event]
+  kmem:mm_page_alloc_extfrag               [Tracepoint event]
+
+Then any (or all) of the above event sources can be activated and
+measured. For example the page alloc/free properties of a 'hackbench
+run' are:
+
+ titan:~> perf stat -e kmem:mm_page_pcpu_drain -e kmem:mm_page_alloc
+ -e kmem:mm_pagevec_free -e kmem:mm_page_free_direct ./hackbench 10
+ Time: 0.575
+
+ Performance counter stats for './hackbench 10':
+
+          13857  kmem:mm_page_pcpu_drain
+          27576  kmem:mm_page_alloc
+           6025  kmem:mm_pagevec_free
+          20934  kmem:mm_page_free_direct
+
+    0.613972165  seconds time elapsed
+
+You can observe the statistical properties as well, by using the
+'repeat the workload N times' feature of perf stat:
+
+ titan:~> perf stat --repeat 5 -e kmem:mm_page_pcpu_drain -e
+   kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
+   kmem:mm_page_free_direct ./hackbench 10
+ Time: 0.627
+ Time: 0.644
+ Time: 0.564
+ Time: 0.559
+ Time: 0.626
+
+ Performance counter stats for './hackbench 10' (5 runs):
+
+          12920  kmem:mm_page_pcpu_drain    ( +-   3.359% )
+          25035  kmem:mm_page_alloc         ( +-   3.783% )
+           6104  kmem:mm_pagevec_free       ( +-   0.934% )
+          18376  kmem:mm_page_free_direct   ( +-   4.941% )
+
+    0.643954516  seconds time elapsed   ( +-   2.363% )
+
+Furthermore, these tracepoints can be used to sample the workload as
+well. For example the page allocations done by a 'git gc' can be
+captured the following way:
+
+ titan:~/git> perf record -f -e kmem:mm_page_alloc -c 1 ./git gc
+ Counting objects: 1148, done.
+ Delta compression using up to 2 threads.
+ Compressing objects: 100% (450/450), done.
+ Writing objects: 100% (1148/1148), done.
+ Total 1148 (delta 690), reused 1148 (delta 690)
+ [ perf record: Captured and wrote 0.267 MB perf.data (~11679 samples) ]
+
+To check which functions generated page allocations:
+
+ titan:~/git> perf report
+ # Samples: 10646
+ #
+ # Overhead          Command               Shared Object
+ # ........  ...............  ..........................
+ #
+    23.57%       git-repack  /lib64/libc-2.5.so
+    21.81%              git  /lib64/libc-2.5.so
+    14.59%              git  ./git
+    11.79%       git-repack  ./git
+     7.12%              git  /lib64/ld-2.5.so
+     3.16%       git-repack  /lib64/libpthread-2.5.so
+     2.09%       git-repack  /bin/bash
+     1.97%               rm  /lib64/libc-2.5.so
+     1.39%               mv  /lib64/ld-2.5.so
+     1.37%               mv  /lib64/libc-2.5.so
+     1.12%       git-repack  /lib64/ld-2.5.so
+     0.95%               rm  /lib64/ld-2.5.so
+     0.90%  git-update-serv  /lib64/libc-2.5.so
+     0.73%  git-update-serv  /lib64/ld-2.5.so
+     0.68%             perf  /lib64/libpthread-2.5.so
+     0.64%       git-repack  /usr/lib64/libz.so.1.2.3
+
+Or to see it on a more finegrained level:
+
+titan:~/git> perf report --sort comm,dso,symbol
+# Samples: 10646
+#
+# Overhead          Command               Shared Object  Symbol
+# ........  ...............  ..........................  ......
+#
+     9.35%       git-repack  ./git                       [.] insert_obj_hash
+     9.12%              git  ./git                       [.] insert_obj_hash
+     7.31%              git  /lib64/libc-2.5.so          [.] memcpy
+     6.34%       git-repack  /lib64/libc-2.5.so          [.] _int_malloc
+     6.24%       git-repack  /lib64/libc-2.5.so          [.] memcpy
+     5.82%       git-repack  /lib64/libc-2.5.so          [.] __GI___fork
+     5.47%              git  /lib64/libc-2.5.so          [.] _int_malloc
+     2.99%              git  /lib64/libc-2.5.so          [.] memset
+
+Furthermore, call-graph sampling can be done too, of page
+allocations - to see precisely what kind of page allocations there
+are:
+
+ titan:~/git> perf record -f -g -e kmem:mm_page_alloc -c 1 ./git gc
+ Counting objects: 1148, done.
+ Delta compression using up to 2 threads.
+ Compressing objects: 100% (450/450), done.
+ Writing objects: 100% (1148/1148), done.
+ Total 1148 (delta 690), reused 1148 (delta 690)
+ [ perf record: Captured and wrote 0.963 MB perf.data (~42069 samples) ]
+
+ titan:~/git> perf report -g
+ # Samples: 10686
+ #
+ # Overhead          Command               Shared Object
+ # ........  ...............  ..........................
+ #
+    23.25%       git-repack  /lib64/libc-2.5.so
+                |
+                |--50.00%-- _int_free
+                |
+                |--37.50%-- __GI___fork
+                |          make_child
+                |
+                |--12.50%-- ptmalloc_unlock_all2
+                |          make_child
+                |
+                 --6.25%-- __GI_strcpy
+    21.61%              git  /lib64/libc-2.5.so
+                |
+                |--30.00%-- __GI_read
+                |          |
+                |           --83.33%-- git_config_from_file
+                |                     git_config
+                |                     |
+   [...]
+
+Or you can observe the whole system's page allocations for 10
+seconds:
+
+titan:~/git> perf stat -a -e kmem:mm_page_pcpu_drain -e
+kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
+kmem:mm_page_free_direct sleep 10
+
+ Performance counter stats for 'sleep 10':
+
+         171585  kmem:mm_page_pcpu_drain
+         322114  kmem:mm_page_alloc
+          73623  kmem:mm_pagevec_free
+         254115  kmem:mm_page_free_direct
+
+   10.000591410  seconds time elapsed
+
+Or observe how fluctuating the page allocations are, via statistical
+analysis done over ten 1-second intervals:
+
+ titan:~/git> perf stat --repeat 10 -a -e kmem:mm_page_pcpu_drain -e
+   kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
+   kmem:mm_page_free_direct sleep 1
+
+ Performance counter stats for 'sleep 1' (10 runs):
+
+          17254  kmem:mm_page_pcpu_drain    ( +-   3.709% )
+          34394  kmem:mm_page_alloc         ( +-   4.617% )
+           7509  kmem:mm_pagevec_free       ( +-   4.820% )
+          25653  kmem:mm_page_free_direct   ( +-   3.672% )
+
+    1.058135029  seconds time elapsed   ( +-   3.089% )
+
+Or you can annotate the recorded 'git gc' run on a per symbol basis
+and check which instructions/source-code generated page allocations:
+
+ titan:~/git> perf annotate __GI___fork
+ ------------------------------------------------
+  Percent |      Source code & Disassembly of libc-2.5.so
+ ------------------------------------------------
+          :
+          :
+          :      Disassembly of section .plt:
+          :      Disassembly of section .text:
+          :
+          :      00000031a2e95560 <__fork>:
+ [...]
+     0.00 :        31a2e95602:   b8 38 00 00 00          mov    $0x38,%eax
+     0.00 :        31a2e95607:   0f 05                   syscall
+    83.42 :        31a2e95609:   48 3d 00 f0 ff ff       cmp    $0xfffffffffffff000,%rax
+     0.00 :        31a2e9560f:   0f 87 4d 01 00 00       ja     31a2e95762 <__fork+0x202>
+     0.00 :        31a2e95615:   85 c0                   test   %eax,%eax
+
+( this shows that 83.42% of __GI___fork's page allocations come from
+  the 0x38 system call it performs. )
+
+etc. etc. - a lot more is possible. I could list a dozen of
+other different usecases straight away - neither of which is
+possible via /proc/vmstat.
+
+/proc/vmstat is not in the same league really, in terms of
+expressive power of system analysis and performance
+analysis.
+
+All that the above results needed were those new tracepoints
+in include/tracing/events/kmem.h.
+
+	Ingo
+
+
-- 
cgit v0.10.2


From 183f3b0887083d36c8a25cd5e3518906415d1889 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Sat, 8 Aug 2009 14:14:15 +0200
Subject: perf_counter tools: Fix libbfd detection for systems with libz
 dependency

Due to a libz dependency in some distro's binutils package,
C++ demangle support isn't compiled in despite the necessary
libraries being available.

Fix this by adding a -lz link test to the dependency detection
rules.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1249733655.6929.5.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 1916e44..60411e9 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -387,10 +387,14 @@ else
 
 	has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y")
 
+	has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y")
+
 	ifeq ($(has_bfd),y)
 		EXTLIBS += -lbfd
 	else ifeq ($(has_bfd_iberty),y)
 		EXTLIBS += -lbfd -liberty
+	else ifeq ($(has_bfd_iberty_z),y)
+		EXTLIBS += -lbfd -liberty -lz
 	else
 		msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling)
 		BASIC_CFLAGS += -DNO_DEMANGLE
-- 
cgit v0.10.2


From c0a8865e32c8d1a562db38e06ef31ef23282f646 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 9 Aug 2009 04:19:15 +0200
Subject: perf tools: callchain: Fix bad rounding of minimum rate

Sometimes we get callchain branches that have a rate under the
limit given by the user.

Say you launched:

 perf record -f -g -a ./hackbench 10
 perf report -g fractal,10.0

And you got:

2.33%       hackbench  [kernel]                  [k] _spin_lock_irqsave
                |
                |--78.57%-- remove_wait_queue
                |          poll_freewait
                |          do_sys_poll
                |          sys_poll
                |          sysenter_dispatch
                |          0xf7ffa430
                |          0x1ffadea3c
                |
                |--7.14%-- __up_read
                |          up_read
                |          do_page_fault
                |          page_fault
                |          0xf7ffa430
                |          0xa0df710000000a
                ...

It is abnormal to get a 7.14% branch whereas we passed a 10%
filter.

The problem is that we round down the minimum threshold. This
happens mostly when we have very low number of events. If the
total amount of your branch is 4 and you have a subranch of 3
events, filtering to 90% will be computed like follows:

  limit = 4 * 0.9;

The result is about 3.6, but the cast to integer will round
down to 3. It means that our filter is actually of 75%

We must then explicitly round up the minimum threshold.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: acme@redhat.com
Cc: peterz@infradead.org
Cc: efault@gmx.de
LKML-Reference: <20090809024235.GA10146@nowhere>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index a8e67aa..0114734 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 #include <stdbool.h>
 #include <errno.h>
+#include <math.h>
 
 #include "callchain.h"
 
@@ -112,7 +113,7 @@ static void __sort_chain_graph_rel(struct callchain_node *node,
 	u64 min_hit;
 
 	node->rb_root = RB_ROOT;
-	min_hit = node->children_hit * min_percent / 100.0;
+	min_hit = ceil(node->children_hit * min_percent);
 
 	chain_for_each_child(child, node) {
 		__sort_chain_graph_rel(child, min_percent);
@@ -126,7 +127,7 @@ static void
 sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_node *chain_root,
 		     u64 min_hit __used, struct callchain_param *param)
 {
-	__sort_chain_graph_rel(chain_root, param->min_percent);
+	__sort_chain_graph_rel(chain_root, param->min_percent / 100.0);
 	rb_root->rb_node = chain_root->rb_root.rb_node;
 }
 
-- 
cgit v0.10.2


From c8c00a6915a2e3d10416e8bdd3138429beb96210 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 10 Aug 2009 12:50:52 +1000
Subject: Remove deadlock potential in md_open

A recent commit:
  commit 449aad3e25358812c43afc60918c5ad3819488e7

introduced the possibility of an A-B/B-A deadlock between
bd_mutex and reconfig_mutex.

__blkdev_get holds bd_mutex while calling md_open which takes
   reconfig_mutex,
do_md_run is always called with reconfig_mutex held, and it now
   takes bd_mutex in the call the revalidate_disk.

This potential deadlock was not caught by lockdep due to the
use of mutex_lock_interruptible_nexted which was introduced
by
   commit d63a5a74dee87883fda6b7d170244acaac5b05e8
do avoid a warning of an impossible deadlock.

It is quite possible to split reconfig_mutex in to two locks.
One protects the array data structures while it is being
reconfigured, the other ensures that an array is never even partially
open while it is being deactivated.
In particular, the second lock prevents an open from completing
between the time when do_md_stop checks if there are any active opens,
and the time when the array is either set read-only, or when ->pers is
set to NULL.  So we can be certain that no IO is in flight as the
array is being destroyed.

So create a new lock, open_mutex, just to ensure exclusion between
'open' and 'stop'.

This avoids the deadlock and also avoids the lockdep warning mentioned
in commit d63a5a74d

Reported-by: "Mike Snitzer" <snitzer@gmail.com>
Reported-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5b98bea..5614500 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -359,6 +359,7 @@ static mddev_t * mddev_find(dev_t unit)
 	else
 		new->md_minor = MINOR(unit) >> MdpMinorShift;
 
+	mutex_init(&new->open_mutex);
 	mutex_init(&new->reconfig_mutex);
 	INIT_LIST_HEAD(&new->disks);
 	INIT_LIST_HEAD(&new->all_mddevs);
@@ -4304,12 +4305,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	struct gendisk *disk = mddev->gendisk;
 	mdk_rdev_t *rdev;
 
+	mutex_lock(&mddev->open_mutex);
 	if (atomic_read(&mddev->openers) > is_open) {
 		printk("md: %s still in use.\n",mdname(mddev));
-		return -EBUSY;
-	}
-
-	if (mddev->pers) {
+		err = -EBUSY;
+	} else if (mddev->pers) {
 
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -4367,7 +4367,10 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			set_disk_ro(disk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	}
-
+out:
+	mutex_unlock(&mddev->open_mutex);
+	if (err)
+		return err;
 	/*
 	 * Free resources if final stop
 	 */
@@ -4433,7 +4436,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	blk_integrity_unregister(disk);
 	md_new_event(mddev);
 	sysfs_notify_dirent(mddev->sysfs_state);
-out:
 	return err;
 }
 
@@ -5518,12 +5520,12 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	}
 	BUG_ON(mddev != bdev->bd_disk->private_data);
 
-	if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
+	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
 		goto out;
 
 	err = 0;
 	atomic_inc(&mddev->openers);
-	mddev_unlock(mddev);
+	mutex_unlock(&mddev->open_mutex);
 
 	check_disk_change(bdev);
  out:
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 78f0316..f8fc188 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -223,6 +223,16 @@ struct mddev_s
 							    * so we don't loop trying */
 
 	int				in_sync;	/* know to not need resync */
+	/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
+	 * that we are never stopping an array while it is open.
+	 * 'reconfig_mutex' protects all other reconfiguration.
+	 * These locks are separate due to conflicting interactions
+	 * with bdev->bd_mutex.
+	 * Lock ordering is:
+	 *  reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
+	 *  bd_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
+	 */
+	struct mutex			open_mutex;
 	struct mutex			reconfig_mutex;
 	atomic_t			active;		/* general refcount */
 	atomic_t			openers;	/* number of active opens */
-- 
cgit v0.10.2


From b2f2e8fee3d62f621e795f25b2fc0f51bbdb4af9 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 10 Aug 2009 16:36:38 +1000
Subject: powerpc/dma: pci_set_dma_mask() shouldn't fail if mask fits in RAM

On an iMac G5, the b43 driver is failing to initialise because trying to
set the dma mask to 30-bit fails. Even though there's only 512MiB of RAM
in the machine anyway:
	https://bugzilla.redhat.com/show_bug.cgi?id=514787

We should probably let it succeed if the available RAM in the system
doesn't exceed the requested limit.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 20a60d6..ccf129d 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -7,6 +7,7 @@
 
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
+#include <linux/lmb.h>
 #include <asm/bug.h>
 #include <asm/abs_addr.h>
 
@@ -90,11 +91,10 @@ static void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sg,
 static int dma_direct_dma_supported(struct device *dev, u64 mask)
 {
 #ifdef CONFIG_PPC64
-	/* Could be improved to check for memory though it better be
-	 * done via some global so platforms can set the limit in case
+	/* Could be improved so platforms can set the limit in case
 	 * they have limited DMA windows
 	 */
-	return mask >= DMA_BIT_MASK(32);
+	return mask >= (lmb_end_of_DRAM() - 1);
 #else
 	return 1;
 #endif
-- 
cgit v0.10.2


From beda2c7ea2c15ed01eef00a997d2b0496c3a502d Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Sun, 9 Aug 2009 15:34:39 -0700
Subject: futex: Update futex_q lock_ptr on requeue proxy lock

futex_requeue() can acquire the lock on behalf of a waiter
early on or during the requeue loop if it is uncontended or in
the event of a lock steal or owner died. On wakeup, the waiter
(in futex_wait_requeue_pi()) cleans up the pi_state owner using
the lock_ptr to protect against concurrent access to the
pi_state. The pi_state is hung off futex_q's on the requeue
target futex hash bucket so the lock_ptr needs to be updated
accordingly.

The problem manifested by triggering the WARN_ON in
lookup_pi_state() about the pid != pi_state->owner->pid.  With
this patch, the pi_state is properly guarded against concurrent
access via the requeue target hb lock.

The astute reviewer may notice that there is a window of time
between when futex_requeue() unlocks the hb locks and when
futex_wait_requeue_pi() will acquire hb2->lock.  During this
time the pi_state and uval are not in sync with the underlying
rtmutex owner (but the uval does indicate there are waiters, so
no atomic changes will occur in userspace).  However, this is
not a problem. Should a contending thread enter
lookup_pi_state() and acquire hb2->lock before the ownership is
fixed up, it will find the pi_state hung off a waiter's
(possibly the pending owner's) futex_q and block on the
rtmutex.  Once futex_wait_requeue_pi() fixes up the owner, it
will also move the pi_state from the old owner's
task->pi_state_list to its own.

v3: Fix plist lock name for application to mainline (rather
    than -rt) Compile tested against tip/v2.6.31-rc5.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@linux.vnet.ibm.com>
LKML-Reference: <4A7F4EFF.6090903@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/futex.c b/kernel/futex.c
index 0672ff8..8cc3ee1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1010,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
  * q:	the futex_q
  * key:	the key of the requeue target futex
+ * hb:  the hash_bucket of the requeue target futex
  *
  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
  * target futex if it is uncontended or via a lock steal.  Set the futex_q key
  * to the requeue target futex so the waiter can detect the wakeup on the right
  * futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition.  Must be called with the q->lock_ptr held.
+ * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
+ * to protect access to the pi_state to fixup the owner later.  Must be called
+ * with both q->lock_ptr and hb->lock held.
  */
 static inline
-void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+			   struct futex_hash_bucket *hb)
 {
 	drop_futex_key_refs(&q->key);
 	get_futex_key_refs(key);
@@ -1030,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
 	WARN_ON(!q->rt_waiter);
 	q->rt_waiter = NULL;
 
+	q->lock_ptr = &hb->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+	q->list.plist.lock = &hb->lock;
+#endif
+
 	wake_up_state(q->task, TASK_NORMAL);
 }
 
@@ -1088,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
 				   set_waiters);
 	if (ret == 1)
-		requeue_pi_wake_futex(top_waiter, key2);
+		requeue_pi_wake_futex(top_waiter, key2, hb2);
 
 	return ret;
 }
@@ -1273,7 +1282,7 @@ retry_private:
 							this->task, 1);
 			if (ret == 1) {
 				/* We got the lock. */
-				requeue_pi_wake_futex(this, &key2);
+				requeue_pi_wake_futex(this, &key2, hb2);
 				continue;
 			} else if (ret) {
 				/* -EDEADLK */
-- 
cgit v0.10.2


From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 10 Aug 2009 11:16:52 +0200
Subject: perf_counter: Correct PERF_SAMPLE_RAW output

PERF_SAMPLE_* output switches should unconditionally output the
correct format, as they are the only way to unambiguously parse
the PERF_EVENT_SAMPLE data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1249896447.17467.74.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2aabe43..a9d823a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -369,6 +369,8 @@ enum perf_event_type {
 	 *
 	 *	{ u64			nr,
 	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
+	 *	{ u32			size;
+	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
 	 * };
 	 */
 	PERF_EVENT_SAMPLE		= 9,
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 7fb16d9..7167b9b 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -685,7 +685,8 @@ static void ftrace_profile_##call(proto)				\
 	pc = preempt_count();						\
 									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
-	__entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\
+	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
+			     sizeof(u64));				\
 									\
 	do {								\
 		char raw_data[__entry_size];				\
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 546e62d..5229d16 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
-	struct perf_raw_record *raw = NULL;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2716,9 +2715,15 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
-		raw = data->raw;
-		if (raw)
-			header.size += raw->size;
+		int size = sizeof(u32);
+
+		if (data->raw)
+			size += data->raw->size;
+		else
+			size += sizeof(u32);
+
+		WARN_ON_ONCE(size & (sizeof(u64)-1));
+		header.size += size;
 	}
 
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2784,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
-	if ((sample_type & PERF_SAMPLE_RAW) && raw)
-		perf_output_copy(&handle, raw->data, raw->size);
+	if (sample_type & PERF_SAMPLE_RAW) {
+		if (data->raw) {
+			perf_output_put(&handle, data->raw->size);
+			perf_output_copy(&handle, data->raw->data, data->raw->size);
+		} else {
+			struct {
+				u32	size;
+				u32	data;
+			} raw = {
+				.size = sizeof(u32),
+				.data = 0,
+			};
+			perf_output_put(&handle, raw);
+		}
+	}
 
 	perf_output_end(&handle);
 }
-- 
cgit v0.10.2


From a4e95fc2cbb31d70a65beffeaf8773f881328c34 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 10 Aug 2009 11:20:12 +0200
Subject: perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data

Raw tracepoint data contains various kernel internals and
data from other users, so restrict this to CAP_SYS_ADMIN.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1249896452.17467.75.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5229d16..b0b20a0 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3787,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
 
 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
+	/*
+	 * Raw tracepoint data is a severe data leak, only allow root to
+	 * have these.
+	 */
+	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+			!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	if (ftrace_profile_enable(counter->attr.config))
 		return NULL;
 
-- 
cgit v0.10.2


From 100d5eb36ba20dc0b99a17ea2b9800c567bfc3d1 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 10 Aug 2009 11:55:51 +0200
Subject: ALSA: hda - Add missing vmaster initialization for ALC269

Without the initialization of vmaster NID, the dB information got
confused for ALC269 codec.

Reference: Novell bnc#527361
	https://bugzilla.novell.com/show_bug.cgi?id=527361

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Cc: <stable@kernel.org>

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 51c44fd..5cc927f 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -13563,6 +13563,8 @@ static int patch_alc269(struct hda_codec *codec)
 		set_capture_mixer(spec);
 	set_beep_amp(spec, 0x0b, 0x04, HDA_INPUT);
 
+	spec->vmaster_nid = 0x02;
+
 	codec->patch_ops = alc_patch_ops;
 	if (board_config == ALC269_AUTO)
 		spec->init_hook = alc269_auto_init;
-- 
cgit v0.10.2


From 13f0feafa6b8aead57a2a328e2fca6a5828bf286 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 23 Jun 2009 21:25:32 +0200
Subject: mm_for_maps: simplify, use ptrace_may_access()

It would be nice to kill __ptrace_may_access(). It requires task_lock(),
but this lock is only needed to read mm->flags in the middle.

Convert mm_for_maps() to use ptrace_may_access(), this also simplifies
the code a little bit.

Also, we do not need to take ->mmap_sem in advance. In fact I think
mm_for_maps() should not play with ->mmap_sem at all, the caller should
take this lock.

With or without this patch, without ->cred_guard_mutex held we can race
with exec() and get the new ->mm but check old creds.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3ce5ae9..917f338 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -237,20 +237,19 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 	struct mm_struct *mm = get_task_mm(task);
 	if (!mm)
 		return NULL;
+	if (mm != current->mm) {
+		/*
+		 * task->mm can be changed before security check,
+		 * in that case we must notice the change after.
+		 */
+		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
+		    mm != task->mm) {
+			mmput(mm);
+			return NULL;
+		}
+	}
 	down_read(&mm->mmap_sem);
-	task_lock(task);
-	if (task->mm != mm)
-		goto out;
-	if (task->mm != current->mm &&
-	    __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
-		goto out;
-	task_unlock(task);
 	return mm;
-out:
-	task_unlock(task);
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-	return NULL;
 }
 
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
-- 
cgit v0.10.2


From 00f89d218523b9bf6b522349c039d5ac80aa536d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 10 Jul 2009 03:27:38 +0200
Subject: mm_for_maps: shift down_read(mmap_sem) to the caller

mm_for_maps() takes ->mmap_sem after security checks, this looks
strange and obfuscates the locking rules. Move this lock to its
single caller, m_start().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 917f338..f3c2e40 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -235,9 +235,8 @@ static int check_mem_permission(struct task_struct *task)
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
 	struct mm_struct *mm = get_task_mm(task);
-	if (!mm)
-		return NULL;
-	if (mm != current->mm) {
+
+	if (mm && mm != current->mm) {
 		/*
 		 * task->mm can be changed before security check,
 		 * in that case we must notice the change after.
@@ -245,10 +244,9 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
 		    mm != task->mm) {
 			mmput(mm);
-			return NULL;
+			mm = NULL;
 		}
 	}
-	down_read(&mm->mmap_sem);
 	return mm;
 }
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6f61b7c..9bd8be1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	mm = mm_for_maps(priv->task);
 	if (!mm)
 		return NULL;
+	down_read(&mm->mmap_sem);
 
 	tail_vma = get_gate_vma(priv->task);
 	priv->tail_vma = tail_vma;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 64a72e2..8f5c05d 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 		priv->task = NULL;
 		return NULL;
 	}
+	down_read(&mm->mmap_sem);
 
 	/* start from the Nth VMA */
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
-- 
cgit v0.10.2


From 704b836cbf19e885f8366bccb2e4b0474346c02d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 10 Jul 2009 03:27:40 +0200
Subject: mm_for_maps: take ->cred_guard_mutex to fix the race with exec

The problem is minor, but without ->cred_guard_mutex held we can race
with exec() and get the new ->mm but check old creds.

Now we do not need to re-check task->mm after ptrace_may_access(), it
can't be changed to the new mm under us.

Strictly speaking, this also fixes another very minor problem. Unless
security check fails or the task exits mm_for_maps() should never
return NULL, the caller should get either old or new ->mm.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f3c2e40..175db25 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -234,19 +234,19 @@ static int check_mem_permission(struct task_struct *task)
 
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
-	struct mm_struct *mm = get_task_mm(task);
+	struct mm_struct *mm;
 
-	if (mm && mm != current->mm) {
-		/*
-		 * task->mm can be changed before security check,
-		 * in that case we must notice the change after.
-		 */
-		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
-		    mm != task->mm) {
-			mmput(mm);
-			mm = NULL;
-		}
+	if (mutex_lock_killable(&task->cred_guard_mutex))
+		return NULL;
+
+	mm = get_task_mm(task);
+	if (mm && mm != current->mm &&
+			!ptrace_may_access(task, PTRACE_MODE_READ)) {
+		mmput(mm);
+		mm = NULL;
 	}
+	mutex_unlock(&task->cred_guard_mutex);
+
 	return mm;
 }
 
-- 
cgit v0.10.2


From 2fc391112fb6f3424435a3aa2fda887497b5f807 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 10 Aug 2009 12:33:05 +0100
Subject: locking, sched: Give waitqueue spinlocks their own lockdep classes

Give waitqueue spinlocks their own lockdep classes when they
are initialised from init_waitqueue_head().  This means that
struct wait_queue::func functions can operate other waitqueues.

This is used by CacheFiles to catch the page from a backing fs
being unlocked and to wake up another thread to take a copy of
it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Takashi Iwai <tiwai@suse.de>
Cc: linux-cachefs@redhat.com
Cc: torvalds@osdl.org
Cc: akpm@linux-foundation.org
LKML-Reference: <20090810113305.17284.81508.stgit@warthog.procyon.org.uk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 6788e1a..cf3c2f5 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -77,7 +77,14 @@ struct task_struct;
 #define __WAIT_BIT_KEY_INITIALIZER(word, bit)				\
 	{ .flags = word, .bit_nr = bit, }
 
-extern void init_waitqueue_head(wait_queue_head_t *q);
+extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *);
+
+#define init_waitqueue_head(q)				\
+	do {						\
+		static struct lock_class_key __key;	\
+							\
+		__init_waitqueue_head((q), &__key);	\
+	} while (0)
 
 #ifdef CONFIG_LOCKDEP
 # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
diff --git a/kernel/wait.c b/kernel/wait.c
index ea7c3b4..c4bd3d8 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
 #include <linux/wait.h>
 #include <linux/hash.h>
 
-void init_waitqueue_head(wait_queue_head_t *q)
+void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
 {
 	spin_lock_init(&q->lock);
+	lockdep_set_class(&q->lock, key);
 	INIT_LIST_HEAD(&q->task_list);
 }
 
-EXPORT_SYMBOL(init_waitqueue_head);
+EXPORT_SYMBOL(__init_waitqueue_head);
 
 void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
 {
-- 
cgit v0.10.2


From 4dc88029fd916b860ef063c40180aa604ce93494 Mon Sep 17 00:00:00 2001
From: Dinakar Guniguntala <dino@in.ibm.com>
Date: Mon, 10 Aug 2009 18:31:42 +0530
Subject: futex: Fix compat_futex to be same as futex for REQUEUE_PI

Need to add the REQUEUE_PI checks to the compat_sys_futex API
as well to ensure 32 bit requeue's work fine on a 64 bit
system. Patch is against latest tip

Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Cc: Darren Hart <dvhltc@us.ibm.com>
LKML-Reference: <20090810130142.GA23619@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b..2357165 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 	int cmd = op & FUTEX_CMD_MASK;
 
 	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
-		      cmd == FUTEX_WAIT_BITSET)) {
+		      cmd == FUTEX_WAIT_BITSET ||
+		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
 		if (get_compat_timespec(&ts, utime))
 			return -EFAULT;
 		if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 			t = ktime_add_safe(ktime_get(), t);
 		tp = &t;
 	}
-	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE)
+	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+	    cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
 		val2 = (int) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-- 
cgit v0.10.2


From 304703aba31a87903b8c0db8f5e6890cac2d596d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 10 Aug 2009 16:11:32 +0200
Subject: perf_counter: Subtract the buffer size field from the event record
 size

We compute the perf raw sample size by aligning the raw ftrace
event size plus the buffer size field itself. We do that
instead of aligning only the perf raw sample size, so that we
might economize some in some cases.

But this buffer size field is not stored in the perf raw
sample, we must then substract its size from the buffer once we
computed the alignment unless we may get a useless u32 field in
the buffer.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090810141129.GA5124@nowhere>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 7167b9b..a05524f 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -637,7 +637,12 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  *	pc = preempt_count();
  *
  *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
- *	__entry_size = __data_size + sizeof(*entry);
+ *
+ *	// Below we want to get the aligned size by taking into account
+ *	// the u32 field that will later store the buffer size
+ *	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),
+ *			     sizeof(u64));
+ *	__entry_size -= sizeof(u32);
  *
  *	do {
  *		char raw_data[__entry_size]; <- allocate our sample in the stack
@@ -687,6 +692,7 @@ static void ftrace_profile_##call(proto)				\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
 			     sizeof(u64));				\
+	__entry_size -= sizeof(u32);					\
 									\
 	do {								\
 		char raw_data[__entry_size];				\
-- 
cgit v0.10.2


From 1853db0e02ae4088f102b0d8e59e83dc98f93f03 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 10 Aug 2009 16:38:36 +0200
Subject: perf_counter: Zero dead bytes from ftrace raw samples size alignment

After aligning the ftrace raw samples, there are dead bytes storing
random data from the stack. We don't want to leak these to userspace,
then zero these out.

Before:

	0x2de88 [0x50]: event: 9
	.
	. ... raw event: size 80 bytes
	.  0000:  09 00 00 00 01 00 50 00 d0 c7 00 81 ff ff ff ff  ......P........
	.  0010:  68 01 00 00 68 01 00 00 2c 00 00 00 00 00 00 00  h...h...,......
	.  0020:  2c 00 00 00 2b 00 01 02 68 01 00 00 68 01 00 00  ,...+...h...h..
	.  0030:  6b 6f 6e 64 65 6d 61 6e 64 2f 30 00 00 00 00 00  kondemand/0....
	.  0040:  68 01 00 00 40 7f 46 81 ff ff ff ff 00 10 1b 7f  h...@.F........
                                                      ^  ^  ^  ^
                                                         Leak

After:

	0x2d318 [0x50]: event: 9
	.
	. ... raw event: size 80 bytes
	.  0000:  09 00 00 00 01 00 50 00 d0 c7 00 81 ff ff ff ff  ......P........
	.  0010:  68 01 00 00 68 01 00 00 68 14 00 00 00 00 00 00  h...h...h......
	.  0020:  2c 00 00 00 2b 00 01 02 68 01 00 00 68 01 00 00  ,...+...h...h..
	.  0030:  6b 6f 6e 64 65 6d 61 6e 64 2f 30 00 00 00 00 00  kondemand/0....
	.  0040:  68 01 00 00 a0 80 46 81 ff ff ff ff 00 00 00 00  h.....F........
                                                      ^  ^  ^  ^
							 Fixed

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1249915116-5210-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index a05524f..f64fbaa 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -648,6 +648,9 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  *		char raw_data[__entry_size]; <- allocate our sample in the stack
  *		struct trace_entry *ent;
  *
+ *		zero dead bytes from alignment to avoid stack leak to userspace:
+ *
+ *		*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
  *		entry = (struct ftrace_raw_<call> *)raw_data;
  *		ent = &entry->ent;
  *		tracing_generic_entry_update(ent, irq_flags, pc);
@@ -698,6 +701,7 @@ static void ftrace_profile_##call(proto)				\
 		char raw_data[__entry_size];				\
 		struct trace_entry *ent;				\
 									\
+		*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;	\
 		entry = (struct ftrace_raw_##call *)raw_data;		\
 		ent = &entry->ent;					\
 		tracing_generic_entry_update(ent, irq_flags, pc);	\
-- 
cgit v0.10.2


From 1392e3b33319fd1a2527bebfc56631c2f2d3c7c5 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 8 Aug 2009 17:52:50 +0900
Subject: documentation: register ioctl entry of nilfs2

This will register the ioctl range used by nilfs2 file system to the
table listed in Documentation/ioctl/ioctl-number.txt.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 7bb0d93..dbea4f9 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -139,6 +139,7 @@ Code	Seq#	Include File		Comments
 'm'	all	linux/synclink.h	conflict!
 'm'	00-1F	net/irda/irmod.h	conflict!
 'n'	00-7F	linux/ncp_fs.h
+'n'	80-8F	linux/nilfs2_fs.h	NILFS2
 'n'	E0-FF	video/matrox.h          matroxfb
 'o'	00-1F	fs/ocfs2/ocfs2_fs.h	OCFS2
 'o'     00-03   include/mtd/ubi-user.h  conflict! (OCFS2 and UBI overlaps)
-- 
cgit v0.10.2


From 5e2f89b5d5d87a7c3ba19fc85ba0c29adb65f639 Mon Sep 17 00:00:00 2001
From: "Figo.zhang" <figo1802@gmail.com>
Date: Sat, 8 Aug 2009 21:01:22 +0800
Subject: mempool.c: clean up type-casting

clean up type-casting twice.  "size_t" is typedef as "unsigned long" in
64-bit system, and "unsigned int" in 32-bit system, and the intermediate
cast to 'long' is pointless.

Signed-off-by: Figo.zhang <figo1802@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/mempool.c b/mm/mempool.c
index a46eb1b..32e75d4 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab);
  */
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
 {
-	size_t size = (size_t)(long)pool_data;
+	size_t size = (size_t)pool_data;
 	return kmalloc(size, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_kmalloc);
 
 void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
 {
-	size_t size = (size_t) pool_data;
+	size_t size = (size_t)pool_data;
 	return kzalloc(size, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_kzalloc);
-- 
cgit v0.10.2


From 04e35357e2e3ff4e0cabd6468354cf3dbfeb4f27 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Mon, 10 Aug 2009 16:45:42 +0100
Subject: MN10300: includecheck fix: mn10300, pci.h

Fix the following 'make includecheck' warning:

  arch/mn10300/include/asm/pci.h: linux/mm.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/mn10300/include/asm/pci.h b/arch/mn10300/include/asm/pci.h
index 35d2ed6..19aecc9 100644
--- a/arch/mn10300/include/asm/pci.h
+++ b/arch/mn10300/include/asm/pci.h
@@ -59,7 +59,6 @@ void pcibios_penalize_isa_irq(int irq);
 #include <linux/slab.h>
 #include <asm/scatterlist.h>
 #include <linux/string.h>
-#include <linux/mm.h>
 #include <asm/io.h>
 
 struct pci_dev;
-- 
cgit v0.10.2


From b6e61eef4f9f94714ac3ee4a5c96862d9bcd1836 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 31 Jul 2009 12:45:41 -0700
Subject: x86: Fix serialization in pit_expect_msb()

Wei Chong Tan reported a fast-PIT-calibration corner-case:

| pit_expect_msb() is vulnerable to SMI disturbance corner case
| in some platforms which causes /proc/cpuinfo to show wrong
| CPU MHz value when quick_pit_calibrate() jumps to success
| section.

I think that the real issue isn't even an SMI - but the fact
that in the very last iteration of the loop, there's no
serializing instruction _after_ the last 'rdtsc'. So even in
the absense of SMI's, we do have a situation where the cycle
counter was read without proper serialization.

The last check should be done outside the outer loop, since
_inside_ the outer loop, we'll be testing that the PIT has
the right MSB value has the right value in the next iteration.

So only the _last_ iteration is special, because that's the one
that will not check the PIT MSB value any more, and because the
final 'get_cycles()' isn't serialized.

In other words:

 - I'd like to move the PIT MSB check to after the last
   iteration, rather than in every iteration

 - I think we should comment on the fact that it's also a
   serializing instruction and so 'fences in' the TSC read.

Here's a suggested replacement.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: "Tan, Wei Chong" <wei.chong.tan@intel.com>
Tested-by: "Tan, Wei Chong" <wei.chong.tan@intel.com>
LKML-Reference: <B28277FD4E0F9247A3D55704C440A140D5D683F3@pgsmsx504.gar.corp.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6e1a368..71f4368 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -275,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
  * use the TSC value at the transitions to calculate a pretty
  * good value for the TSC frequencty.
  */
+static inline int pit_verify_msb(unsigned char val)
+{
+	/* Ignore LSB */
+	inb(0x42);
+	return inb(0x42) == val;
+}
+
 static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
 {
 	int count;
 	u64 tsc = 0;
 
 	for (count = 0; count < 50000; count++) {
-		/* Ignore LSB */
-		inb(0x42);
-		if (inb(0x42) != val)
+		if (!pit_verify_msb(val))
 			break;
 		tsc = get_cycles();
 	}
@@ -336,8 +341,7 @@ static unsigned long quick_pit_calibrate(void)
 	 * to do that is to just read back the 16-bit counter
 	 * once from the PIT.
 	 */
-	inb(0x42);
-	inb(0x42);
+	pit_verify_msb(0);
 
 	if (pit_expect_msb(0xff, &tsc, &d1)) {
 		for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
@@ -348,8 +352,19 @@ static unsigned long quick_pit_calibrate(void)
 			 * Iterate until the error is less than 500 ppm
 			 */
 			delta -= tsc;
-			if (d1+d2 < delta >> 11)
-				goto success;
+			if (d1+d2 >= delta >> 11)
+				continue;
+
+			/*
+			 * Check the PIT one more time to verify that
+			 * all TSC reads were stable wrt the PIT.
+			 *
+			 * This also guarantees serialization of the
+			 * last cycle read ('d2') in pit_expect_msb.
+			 */
+			if (!pit_verify_msb(0xfe - i))
+				break;
+			goto success;
 		}
 	}
 	printk("Fast TSC calibration failed\n");
-- 
cgit v0.10.2


From 392741e0a4e17c82e3978b7fcbf04291294dc0a1 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 7 Aug 2009 15:20:48 -0700
Subject: futex: Fix handling of bad requeue syscall pairing

If futex_requeue(requeue_pi=1) finds a futex_q that was created by a call
other the futex_wait_requeue_pi(), the q.rt_waiter may be null.  If so,
this will result in an oops from the following call graph:

futex_requeue()
  rt_mutex_start_proxy_lock()
    task_blocks_on_rt_mutex()
      waiter->task dereference
        OOPS

We currently WARN_ON() if this is detected, clearly this is inadequate.
If we detect a mispairing in futex_requeue(), bail out, seding -EINVAL to
user-space.

V2: Fix parenthesis warnings.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: John Kacur <jkacur@redhat.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@linux.vnet.ibm.com>
LKML-Reference: <4A7CA8C0.7010809@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/futex.c b/kernel/futex.c
index 8cc3ee1..e18cfbd 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1256,8 +1256,15 @@ retry_private:
 		if (!match_futex(&this->key, &key1))
 			continue;
 
-		WARN_ON(!requeue_pi && this->rt_waiter);
-		WARN_ON(requeue_pi && !this->rt_waiter);
+		/*
+		 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+		 * be paired with each other and no other futex ops.
+		 */
+		if ((requeue_pi && !this->rt_waiter) ||
+		    (!requeue_pi && this->rt_waiter)) {
+			ret = -EINVAL;
+			break;
+		}
 
 		/*
 		 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
-- 
cgit v0.10.2


From 3e03bbeac541856aaaf1ce1ab0250b6a490e4099 Mon Sep 17 00:00:00 2001
From: Shunichi Fuji <palglowr@gmail.com>
Date: Tue, 11 Aug 2009 03:34:40 +0900
Subject: x86: Add reboot quirk for every 5 series MacBook/Pro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reboot does not work on my MacBook Pro 13 inch (MacBookPro5,5)
too. It seems all unibody MacBook and MacBookPro require
PCI reboot handling, i guess.

Following model/machine ID list shows unibody MacBook/Pro have
the 5 series of model number:

   http://www.everymac.com/systems/by_capability/macs-by-machine-model-machine-id.html

Signed-off-by: Shunichi Fuji <palglowr@gmail.com>
Cc: Ozan Çağlayan <ozan@pardus.org.tr>
LKML-Reference: <30046e3b0908101134p6487ddbftd8776e4ddef204be@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 9eb8976..a06e8d1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -418,20 +418,20 @@ static int __init set_pci_reboot(const struct dmi_system_id *d)
 }
 
 static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
-	{	/* Handle problems with rebooting on Apple MacBook5,2 */
+	{	/* Handle problems with rebooting on Apple MacBook5 */
 		.callback = set_pci_reboot,
-		.ident = "Apple MacBook",
+		.ident = "Apple MacBook5",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
-			DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
 		},
 	},
-	{	/* Handle problems with rebooting on Apple MacBookPro5,1 */
+	{	/* Handle problems with rebooting on Apple MacBookPro5 */
 		.callback = set_pci_reboot,
-		.ident = "Apple MacBookPro5,1",
+		.ident = "Apple MacBookPro5",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
-			DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5,1"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
 		},
 	},
 	{ }
-- 
cgit v0.10.2


From b409d7a0ab46fe530efe52734984b4ed5d46c3eb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 6 Aug 2009 23:29:34 +0200
Subject: ocfs2: Fix possible deadlock when extending quota file

In OCFS2, allocator locks rank above transaction start. Thus we
cannot extend quota file from inside a transaction less we could
deadlock.

We solve the problem by starting transaction not already in
ocfs2_acquire_dquot() but only in ocfs2_local_read_dquot() and
ocfs2_global_read_dquot() and we allocate blocks to quota files before starting
the transaction.  In case we crash, quota files will just have a few blocks
more but that's no problem since we just use them next time we extend the
quota file.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 4a4d3b5..2c3222a 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -363,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb)
 	return credits;
 }
 
-/* Number of credits needed for removing quota structure from file */
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
-/* Number of credits needed for initialization of new quota structure */
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
-
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
 
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index d604a6a..bf7742d 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -215,11 +215,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 		loff_t rounded_end =
 				ocfs2_align_bytes_to_blocks(sb, off + len);
 
-		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
-		err = ocfs2_extend_no_holes(gqinode, rounded_end, off);
-		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
-		if (err < 0)
-			goto out;
+		/* Space is already allocated in ocfs2_global_read_dquot() */
 		err = ocfs2_simple_size_update(gqinode,
 					       oinfo->dqi_gqi_bh,
 					       rounded_end);
@@ -405,13 +401,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type)
 	return err;
 }
 
+static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+	/*
+	 * We may need to allocate tree blocks and a leaf block but not the
+	 * root block
+	 */
+	return oinfo->dqi_gi.dqi_qtree_depth;
+}
+
+static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
+{
+	/* We modify all the allocated blocks, tree root, and info block */
+	return (ocfs2_global_qinit_alloc(sb, type) + 2) *
+			OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
+}
+
 /* Read in information from global quota file and acquire a reference to it.
  * dquot_acquire() has already started the transaction and locked quota file */
 int ocfs2_global_read_dquot(struct dquot *dquot)
 {
 	int err, err2, ex = 0;
-	struct ocfs2_mem_dqinfo *info =
-			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct inode *gqinode = info->dqi_gqinode;
+	int need_alloc = ocfs2_global_qinit_alloc(sb, type);
+	handle_t *handle = NULL;
 
 	err = ocfs2_qinfo_lock(info, 0);
 	if (err < 0)
@@ -422,14 +441,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
 	OCFS2_DQUOT(dquot)->dq_use_count++;
 	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
 	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	ocfs2_qinfo_unlock(info, 0);
+
 	if (!dquot->dq_off) {	/* No real quota entry? */
-		/* Upgrade to exclusive lock for allocation */
-		ocfs2_qinfo_unlock(info, 0);
-		err = ocfs2_qinfo_lock(info, 1);
-		if (err < 0)
-			goto out_qlock;
 		ex = 1;
+		/*
+		 * Add blocks to quota file before we start a transaction since
+		 * locking allocators ranks above a transaction start
+		 */
+		WARN_ON(journal_current_handle());
+		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		err = ocfs2_extend_no_holes(gqinode,
+			gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
+			gqinode->i_size);
+		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		if (err < 0)
+			goto out;
+	}
+
+	handle = ocfs2_start_trans(osb,
+				   ocfs2_calc_global_qinit_credits(sb, type));
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out;
 	}
+	err = ocfs2_qinfo_lock(info, ex);
+	if (err < 0)
+		goto out_trans;
 	err = qtree_write_dquot(&info->dqi_gi, dquot);
 	if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
 		err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
@@ -441,6 +479,9 @@ out_qlock:
 		ocfs2_qinfo_unlock(info, 1);
 	else
 		ocfs2_qinfo_unlock(info, 0);
+out_trans:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
 out:
 	if (err < 0)
 		mlog_errno(err);
@@ -638,24 +679,17 @@ out:
 	return status;
 }
 
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
 {
-	struct ocfs2_mem_dqinfo *oinfo;
-	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
-				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
-
-	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
-		return 0;
-
-	oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
 	/*
 	 * We modify tree, leaf block, global info, local chunk header,
 	 * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
 	 * accounts for inode update
 	 */
-	return oinfo->dqi_gi.dqi_qtree_depth +
+	return (oinfo->dqi_gi.dqi_qtree_depth + 2) *
+	       OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
 	       OCFS2_QINFO_WRITE_CREDITS +
-	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
 	       OCFS2_INODE_UPDATE_CREDITS;
 }
 
@@ -688,36 +722,10 @@ out:
 	return status;
 }
 
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
-{
-	struct ocfs2_mem_dqinfo *oinfo;
-	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
-				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
-	struct ocfs2_dinode *lfe, *gfe;
-
-	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
-		return 0;
-
-	oinfo = sb_dqinfo(sb, type)->dqi_priv;
-	gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
-	lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
-	/* We can extend local file + global file. In local file we
-	 * can modify info, chunk header block and dquot block. In
-	 * global file we can modify info, tree and leaf block */
-	return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
-	       ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
-	       OCFS2_LOCAL_QINFO_WRITE_CREDITS +
-	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
-	       oinfo->dqi_gi.dqi_qtree_depth +
-	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
-}
-
 static int ocfs2_acquire_dquot(struct dquot *dquot)
 {
-	handle_t *handle;
 	struct ocfs2_mem_dqinfo *oinfo =
 			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
-	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
 	int status = 0;
 
 	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
@@ -726,16 +734,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
 	status = ocfs2_lock_global_qf(oinfo, 1);
 	if (status < 0)
 		goto out;
-	handle = ocfs2_start_trans(osb,
-		ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto out_ilock;
-	}
 	status = dquot_acquire(dquot);
-	ocfs2_commit_trans(osb, handle);
-out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
 	mlog_exit(status);
-- 
cgit v0.10.2


From 85dfd81dc57e8183a277ddd7a56aa65c96f3f487 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 10 Aug 2009 13:21:19 -0700
Subject: pty: fix data loss when stopped (^S/^Q)

Commit d945cb9cc ("pty: Rework the pty layer to use the normal buffering
logic") dropped the test for 'tty->stopped' in pty_write_room(), which
then causes the n_tty line discipline thing to not throttle the data
properly when the tty is stopped.

So instead of pausing the write due to the tty being stopped, the ldisc
layer would go ahead and push it down to the pty.  The pty write()
routine would then refuse to take the data (because it _did_ check
'stopped'), and the data wouldn't actually be written.

This whole stopped test should eventually be moved into the tty ldisc
layer rather than have low-level tty drivers care about these things,
but right now the fix is to just re-instate the missing pty 'stopped'
handling.

Reported-and-tested-by: Artur Skawina <art.08.09@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index 6e6942c..d083c73 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -144,6 +144,8 @@ static int pty_write(struct tty_struct *tty, const unsigned char *buf,
 
 static int pty_write_room(struct tty_struct *tty)
 {
+	if (tty->stopped)
+		return 0;
 	return pty_space(tty->link);
 }
 
-- 
cgit v0.10.2


From 651b1f125c7e3806bbd635739d009433dc07372d Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 10 Aug 2009 23:41:18 +0200
Subject: PM / Driver Core: Kill dev_pm_ops platform warning for now

Commit 783ea7d4eeefe895f2731fe73ac951e94418927b
(Driver Core: Rework platform suspend/resume, print warning)
added a warning message printed for platform drivers that use the
legacy PM callbacks rather than struct dev_pm_ops.  Unfortunately,
this resulted in some confusion and made some people try to convert
drivers by replacing the old callbacks with struct dev_pm_ops in
automatic way, which generally is not a good idea.

Remove the platform device runtime dev_pm_ops warning for now,
because it's annoying to users and it's not really necessary right
now.

[rjw: Modified the changelog to be more informative.]

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 81cb01b..456594b 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -483,9 +483,6 @@ int platform_driver_register(struct platform_driver *drv)
 		drv->driver.remove = platform_drv_remove;
 	if (drv->shutdown)
 		drv->driver.shutdown = platform_drv_shutdown;
-	if (drv->suspend || drv->resume)
-		pr_warning("Platform driver '%s' needs updating - please use "
-			"dev_pm_ops\n", drv->driver.name);
 
 	return driver_register(&drv->driver);
 }
-- 
cgit v0.10.2


From 314dabb83a547ec4da819e8cbc78fac9cec605cd Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Mon, 10 Aug 2009 22:00:13 +1000
Subject: SELinux: fix memory leakage in /security/selinux/hooks.c

Fix memory leakage in /security/selinux/hooks.c

The buffer always needs to be freed here; we either error
out or allocate more memory.

Reported-by: iceberg <strakh@ispras.ru>
Signed-off-by: James Morris <jmorris@namei.org>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 15c2a08..1e8cfc4 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1285,6 +1285,8 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 		rc = inode->i_op->getxattr(dentry, XATTR_NAME_SELINUX,
 					   context, len);
 		if (rc == -ERANGE) {
+			kfree(context);
+
 			/* Need a larger buffer.  Query for the right size. */
 			rc = inode->i_op->getxattr(dentry, XATTR_NAME_SELINUX,
 						   NULL, 0);
@@ -1292,7 +1294,6 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 				dput(dentry);
 				goto out_unlock;
 			}
-			kfree(context);
 			len = rc;
 			context = kmalloc(len+1, GFP_NOFS);
 			if (!context) {
-- 
cgit v0.10.2


From dd704698f56c1451fc9c5daadcd6e3a089de2c40 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Tue, 11 Aug 2009 08:45:11 +0200
Subject: ALSA: hda - Don't override ADC definitions for ALC codecs

ALC269 and ALC861-VD parsers override the ADC definitions
unconditionally without checking the spec definition.  This causes
the problem when any inconsistent ADC is set up in the device quirk
(like ALC272 with digital-mic).

This patch avoids the overriding by adding the proper checks.

Reference: Novell bnc#529467
	https://bugzilla.novell.com/show_bug.cgi?id=529467

Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 5cc927f..fea9767 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -15579,9 +15579,12 @@ static int patch_alc861vd(struct hda_codec *codec)
 	spec->stream_digital_playback = &alc861vd_pcm_digital_playback;
 	spec->stream_digital_capture = &alc861vd_pcm_digital_capture;
 
-	spec->adc_nids = alc861vd_adc_nids;
-	spec->num_adc_nids = ARRAY_SIZE(alc861vd_adc_nids);
-	spec->capsrc_nids = alc861vd_capsrc_nids;
+	if (!spec->adc_nids) {
+		spec->adc_nids = alc861vd_adc_nids;
+		spec->num_adc_nids = ARRAY_SIZE(alc861vd_adc_nids);
+	}
+	if (!spec->capsrc_nids)
+		spec->capsrc_nids = alc861vd_capsrc_nids;
 
 	set_capture_mixer(spec);
 	set_beep_amp(spec, 0x0b, 0x05, HDA_INPUT);
@@ -17498,9 +17501,12 @@ static int patch_alc662(struct hda_codec *codec)
 	spec->stream_digital_playback = &alc662_pcm_digital_playback;
 	spec->stream_digital_capture = &alc662_pcm_digital_capture;
 
-	spec->adc_nids = alc662_adc_nids;
-	spec->num_adc_nids = ARRAY_SIZE(alc662_adc_nids);
-	spec->capsrc_nids = alc662_capsrc_nids;
+	if (!spec->adc_nids) {
+		spec->adc_nids = alc662_adc_nids;
+		spec->num_adc_nids = ARRAY_SIZE(alc662_adc_nids);
+	}
+	if (!spec->capsrc_nids)
+		spec->capsrc_nids = alc662_capsrc_nids;
 
 	if (!spec->cap_mixer)
 		set_capture_mixer(spec);
-- 
cgit v0.10.2


From 0d01f31439c1e4d602bf9fdc924ab66f407f5e38 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Sun, 9 Aug 2009 21:44:49 -0700
Subject: x86, mce: therm_throt - change when we print messages

My Latitude d630 seems to be handling thermal events in SMI by
lowering the max frequency of the CPU till it cools down but
still leaks the "everything is normal" events.

This spams the console and with high priority printks.

Adjust therm_throt driver to only print messages about the fact
that temperatire returned back to normal when leaving the
throttling state.

Also lower the severity of "back to normal" message from
KERN_CRIT to KERN_INFO.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
Acked-by: H. Peter Anvin <hpa@zytor.com>
LKML-Reference: <20090810051513.0558F526EC9@mailhub.coreip.homeip.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index bff8dd1..8bc64cf 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -36,6 +36,7 @@
 
 static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
 static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
+static DEFINE_PER_CPU(bool, thermal_throttle_active);
 
 static atomic_t therm_throt_en		= ATOMIC_INIT(0);
 
@@ -96,24 +97,27 @@ static int therm_throt_process(int curr)
 {
 	unsigned int cpu = smp_processor_id();
 	__u64 tmp_jiffs = get_jiffies_64();
+	bool was_throttled = __get_cpu_var(thermal_throttle_active);
+	bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr;
 
-	if (curr)
+	if (is_throttled)
 		__get_cpu_var(thermal_throttle_count)++;
 
-	if (time_before64(tmp_jiffs, __get_cpu_var(next_check)))
+	if (!(was_throttled ^ is_throttled) &&
+	    time_before64(tmp_jiffs, __get_cpu_var(next_check)))
 		return 0;
 
 	__get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
 
 	/* if we just entered the thermal event */
-	if (curr) {
+	if (is_throttled) {
 		printk(KERN_CRIT "CPU%d: Temperature above threshold, "
-		       "cpu clock throttled (total events = %lu)\n", cpu,
-		       __get_cpu_var(thermal_throttle_count));
+		       "cpu clock throttled (total events = %lu)\n",
+		       cpu, __get_cpu_var(thermal_throttle_count));
 
 		add_taint(TAINT_MACHINE_CHECK);
-	} else {
-		printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu);
+	} else if (was_throttled) {
+		printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
 	}
 
 	return 1;
-- 
cgit v0.10.2


From 3c581a7f94542341bf0da496a226b44ac63521a8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 11 Aug 2009 10:47:36 +0200
Subject: perf_counter, x86: Fix lapic printk message

Instead of this garbled bootup on UP Pentium-M systems:

[    0.015048] Performance Counters:
[    0.016004] no Local APIC, try rebooting with lapicno PMU driver, software counters only.

Print:

[    0.015050] Performance Counters:
[    0.016004] no APIC, boot with the "lapic" boot parameter to force-enable it.
[    0.017003] no PMU driver, software counters only.

Cf: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a7aa8f9..40e233a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1590,7 +1590,7 @@ static int p6_pmu_init(void)
 	}
 
 	if (!cpu_has_apic) {
-		pr_info("no Local APIC, try rebooting with lapic");
+		pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
 		return -ENODEV;
 	}
 
-- 
cgit v0.10.2


From f64ccccb8afa43abdd63fcbd230f818d6ea0883f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 11 Aug 2009 10:26:33 +0200
Subject: perf_counter, x86: Fix generic cache events on P6-mobile CPUs

Johannes Stezenbach reported that 'perf stat' does not count
cache-miss and cache-references events on his Pentium-M based
laptop.

This is because we left them blank in p6_perfmon_event_map[],
fill them in.

Reported-by: Johannes Stezenbach <js@sig21.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 40e233a..fffc126 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -72,8 +72,8 @@ static const u64 p6_perfmon_event_map[] =
 {
   [PERF_COUNT_HW_CPU_CYCLES]		= 0x0079,
   [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0000,
-  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0000,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x012e,
   [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
   [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
   [PERF_COUNT_HW_BUS_CYCLES]		= 0x0062,
-- 
cgit v0.10.2


From fbd8b1819e80ac5a176d085fdddc3a34d1499318 Mon Sep 17 00:00:00 2001
From: Kevin Winchester <kjwinchester@gmail.com>
Date: Mon, 10 Aug 2009 19:56:45 -0300
Subject: x86: Clear incorrectly forced X86_FEATURE_LAHF_LM flag

Due to an erratum with certain AMD Athlon 64 processors, the
BIOS may need to force enable the LAHF_LM capability.
Unfortunately, in at least one case, the BIOS does this even
for processors that do not support the functionality.

Add a specific check that will clear the feature bit for
processors known not to support the LAHF/SAHF instructions.

Signed-off-by: Kevin Winchester <kjwinchester@gmail.com>
Acked-by: Borislav Petkov <petkovbb@googlemail.com>
LKML-Reference: <4A80A5AD.2000209@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e2485b0..63fddcd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -400,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		level = cpuid_eax(1);
 		if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
 			set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+		/*
+		 * Some BIOSes incorrectly force this feature, but only K8
+		 * revision D (model = 0x14) and later actually support it.
+		 */
+		if (c->x86_model < 0x14)
+			clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
 	}
 	if (c->x86 == 0x10 || c->x86 == 0x11)
 		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-- 
cgit v0.10.2


From 51b89f7a6615eca184aa0b85db5781d931e9c8d1 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <[fenghua.yu@intel.com]>
Date: Tue, 11 Aug 2009 14:52:10 -0700
Subject: Bug Fix arch/ia64/kernel/pci-dma.c: fix recursive dma_supported()
 call in iommu_dma_supported()

In commit 160c1d8e40866edfeae7d68816b7005d70acf391,
dma_ops->dma_supported = iommu_dma_supported;

This dma_ops->dma_supported is first called in platform_dma_init() during kernel
boot. Then dma_ops->dma_supported will be called recursively in
iommu_dma_supported.

Kernel can not boot because kernel can not get out of iommu_dma_supported until
it runs out of stack memory.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>

diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
index 0569596..f6b1ff0 100644
--- a/arch/ia64/kernel/pci-dma.c
+++ b/arch/ia64/kernel/pci-dma.c
@@ -69,11 +69,6 @@ iommu_dma_init(void)
 
 int iommu_dma_supported(struct device *dev, u64 mask)
 {
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-
-	if (ops->dma_supported)
-		return ops->dma_supported(dev, mask);
-
 	/* Copied from i386. Doesn't make much sense, because it will
 	   only work for pci_alloc_coherent.
 	   The caller just has to use GFP_DMA in this case. */
-- 
cgit v0.10.2


From 8d6f9af91959256244878cd801c1c969e66cd093 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <[hannes@cmpxchg.org]>
Date: Tue, 11 Aug 2009 14:52:10 -0700
Subject: ia64: boolean __test_and_clear_bit

__test_and_clear_bit() returns a bitfield with the tested-for bit set.
Make it consistent with the other bitops - of ia64 but also every
other architecture - and return a boolean value.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Fenghua Yu <fenghua.yu@intel.com>

diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h
index e2ca800..57a2787 100644
--- a/arch/ia64/include/asm/bitops.h
+++ b/arch/ia64/include/asm/bitops.h
@@ -286,7 +286,7 @@ __test_and_clear_bit(int nr, volatile void * addr)
 {
 	__u32 *p = (__u32 *) addr + (nr >> 5);
 	__u32 m = 1 << (nr & 31);
-	int oldbitset = *p & m;
+	int oldbitset = (*p & m) != 0;
 
 	*p &= ~m;
 	return oldbitset;
-- 
cgit v0.10.2


From cfa5f809e399c699974ba6018eefa022bbc2e16e Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <[jaswinder@kernel.org]>
Date: Tue, 11 Aug 2009 14:52:10 -0700
Subject: IA64: includecheck fix: ia64, ia64_ksyms.c

fix the following 'make includecheck' warning:

  arch/ia64/kernel/ia64_ksyms.c: asm/page.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Acked-by: Fenghua Yu <fenghua.yu@intel.com>

diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c
index 2d31186..8ebccb5 100644
--- a/arch/ia64/kernel/ia64_ksyms.c
+++ b/arch/ia64/kernel/ia64_ksyms.c
@@ -21,6 +21,7 @@ EXPORT_SYMBOL(csum_ipv6_magic);
 
 #include <asm/page.h>
 EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(copy_page);
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
 #include <linux/bootmem.h>
@@ -60,9 +61,6 @@ EXPORT_SYMBOL(__udivdi3);
 EXPORT_SYMBOL(__moddi3);
 EXPORT_SYMBOL(__umoddi3);
 
-#include <asm/page.h>
-EXPORT_SYMBOL(copy_page);
-
 #if defined(CONFIG_MD_RAID456) || defined(CONFIG_MD_RAID456_MODULE)
 extern void xor_ia64_2(void);
 extern void xor_ia64_3(void);
-- 
cgit v0.10.2


From b5a8879347bbe68bd24c8870503bf6a0362da26b Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <[jaswinder@kernel.org]>
Date: Tue, 11 Aug 2009 14:52:11 -0700
Subject: IA64: includecheck fix: ia64, pgtable.h

fix the following 'make includecheck' warning:

  arch/ia64/include/asm/pgtable.h: asm/processor.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Acked-by: Fenghua Yu <fenghua.yu@intel.com>

diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 0a9cc73..8840a69 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -155,7 +155,6 @@
 #include <linux/bitops.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
-#include <asm/processor.h>
 
 /*
  * Next come the mappings that determine how mmap() protection bits
-- 
cgit v0.10.2


From bf2a4c7270b9a22243a91ab5efcc47aaf997c66b Mon Sep 17 00:00:00 2001
From: Fenghua Yu <[fenghua.yu@intel.com]>
Date: Tue, 11 Aug 2009 14:52:11 -0700
Subject: arch/ia64/Makefile: Remove -mtune=merced in IA64 kernel build

Between GCC version 3.4.0 and 4.3.3 (including 3.4.0 and 4.3.3), -mtune=merced
is implemented in GCC. Starting from 4.4.0, -mtune=merced is deprecated.

Even implemented in versions between 3.4.0 and 4.3.3, the -mtune=merced
feature has been broken in some of the versions. For example, GCC 4.1.2 reports
interanl tuning function errors during kernel building with -mtune=merced. Or
GCC Bugzilla 16130 reports another -mtune=merced issue on GCC 3.4.1.

So I would remove the -mtune=merced from IA64 kernel build. Without this option,
kernel on Merced will remain the same except losing an unstable and out-of-date
performance tunning feature.

Since GCC version 3.4.0, -mtune=mckinley has been implemented. The
-mtune=mckinley option functions the same as mtune=itanium2. And mtune=itanium2
is the default option. So we don't need to add mtune=mckinley either since its
been the default option in any GCC version which implements this option.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>

diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
index 58a7e46a..e7cbaa0 100644
--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -41,11 +41,6 @@ $(error Sorry, you need a newer version of the assember, one that is built from
 		ftp://ftp.hpl.hp.com/pub/linux-ia64/gas-030124.tar.gz)
 endif
 
-ifeq ($(call cc-version),0304)
-	cflags-$(CONFIG_ITANIUM)	+= -mtune=merced
-	cflags-$(CONFIG_MCKINLEY)	+= -mtune=mckinley
-endif
-
 KBUILD_CFLAGS += $(cflags-y)
 head-y := arch/ia64/kernel/head.o arch/ia64/kernel/init_task.o
 
-- 
cgit v0.10.2


From 5359dffd4396f281c5b77de1acbee6fb1b333b23 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Tue, 11 Aug 2009 14:52:11 -0700
Subject: ia64/topology.c: exit cache_add_dev when kobject_init_and_add fails

Make cache_add_dev exit sysfs when kobject_init_and_add returns an error.

Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>

diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index bc80dff..8f06035 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -372,6 +372,10 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 	retval = kobject_init_and_add(&all_cpu_cache_info[cpu].kobj,
 				      &cache_ktype_percpu_entry, &sys_dev->kobj,
 				      "%s", "cache");
+	if (unlikely(retval < 0)) {
+		cpu_cache_sysfs_exit(cpu);
+		return retval;
+	}
 
 	for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++) {
 		this_object = LEAF_KOBJECT_PTR(cpu,i);
@@ -385,7 +389,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 			}
 			kobject_put(&all_cpu_cache_info[cpu].kobj);
 			cpu_cache_sysfs_exit(cpu);
-			break;
+			return retval;
 		}
 		kobject_uevent(&(this_object->kobj), KOBJ_ADD);
 	}
-- 
cgit v0.10.2


From e7369e01eb85550ed60dd1b0e120b69dfb03dc23 Mon Sep 17 00:00:00 2001
From: Roel Kluin <[roel.kluin@gmail.com]>
Date: Tue, 11 Aug 2009 14:52:11 -0700
Subject: arch/ia64/kernel/iosapic: missing test after ioremap()

Missing test after ioremap()

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Fenghua Yu <fenghua.yu@intel.com>

diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index c48b03f..dab4d39 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -1072,6 +1072,10 @@ iosapic_init (unsigned long phys_addr, unsigned int gsi_base)
 	}
 
 	addr = ioremap(phys_addr, 0);
+	if (addr == NULL) {
+		spin_unlock_irqrestore(&iosapic_lock, flags);
+		return -ENOMEM;
+	}
 	ver = iosapic_version(addr);
 	if ((err = iosapic_check_gsi_range(gsi_base, ver))) {
 		iounmap(addr);
-- 
cgit v0.10.2


From 0cc6eee130b0c062feec8446d9cecdb17d2cfad3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:53 -0400
Subject: xfs: avoid memory allocation under m_peraglock in growfs code

Allocate the memory for the larger m_perag array before taking the
per-AG lock as the per-AG lock can be taken under the i_lock which
can be taken from reclaim context.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cbd451b..2d0b3e1 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,17 +167,25 @@ xfs_growfs_data_private(
 	new = nb - mp->m_sb.sb_dblocks;
 	oagcount = mp->m_sb.sb_agcount;
 	if (nagcount > oagcount) {
+		void *new_perag, *old_perag;
+
 		xfs_filestream_flush(mp);
+
+		new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
+					KM_MAYFAIL);
+		if (!new_perag)
+			return XFS_ERROR(ENOMEM);
+
 		down_write(&mp->m_peraglock);
-		mp->m_perag = kmem_realloc(mp->m_perag,
-			sizeof(xfs_perag_t) * nagcount,
-			sizeof(xfs_perag_t) * oagcount,
-			KM_SLEEP);
-		memset(&mp->m_perag[oagcount], 0,
-			(nagcount - oagcount) * sizeof(xfs_perag_t));
+		memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
+		old_perag = mp->m_perag;
+		mp->m_perag = new_perag;
+
 		mp->m_flags |= XFS_MOUNT_32BITINODES;
 		nagimax = xfs_initialize_perag(mp, nagcount);
 		up_write(&mp->m_peraglock);
+
+		kmem_free(old_perag);
 	}
 	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
 	tp->t_flags |= XFS_TRANS_RESERVE;
-- 
cgit v0.10.2


From ca35dcd6cae7d4a780c484c53f45548c4719f82c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:54 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_getbmap

xfs_getbmap allocates memory with i_lock held, but i_lock is taken in
reclaim context so all allocations under it must avoid recursions into
the filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 7928b99..8ee5b5a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -6009,7 +6009,7 @@ xfs_getbmap(
 	 */
 	error = ENOMEM;
 	subnex = 16;
-	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL);
+	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
 	if (!map)
 		goto out_unlock_ilock;
 
-- 
cgit v0.10.2


From f41d7fb9da05b604f8a69fb6cac2a0563c8ede4e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:55 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_da_state_alloc

xfs_da_state_alloc is always called with i_lock held, but i_lock is taken in
reclaim context so all allocations under it must avoid recursions into the
filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9ff6e57..bd0bb6d 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone;		/* dabuf zone */
 xfs_da_state_t *
 xfs_da_state_alloc(void)
 {
-	return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
+	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
 }
 
 /*
-- 
cgit v0.10.2


From 73195ed7864ae4a1fb0bea2ed9df59d19b4fde90 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:56 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_da_buf_make

i_lock is taken in the reclaim context so all allocations under it
must avoid recursions into the filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index bd0bb6d..2847bbc 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
 	int		off;
 
 	if (nbuf == 1)
-		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
+		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS);
 	else
-		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
+		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS);
 	dabuf->dirty = 0;
 #ifdef XFS_DABUF_DEBUG
 	dabuf->ra = ra;
-- 
cgit v0.10.2


From 3f52c2f0a07c23771909cc53f2e9451a7f1bf253 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:57 -0400
Subject: xfs: switch to NOFS allocation under i_lock in
 xfs_dir_cilookup_result

xfs_dir_cilookup_result is always called with i_lock held, but i_lock is taken
in reclaim context so all allocations under it must avoid recursions into the
filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index c657bec..bb1d58e 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -256,7 +256,7 @@ xfs_dir_cilookup_result(
 					!(args->op_flags & XFS_DA_OP_CILOOKUP))
 		return EEXIST;
 
-	args->value = kmem_alloc(len, KM_MAYFAIL);
+	args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
 	if (!args->value)
 		return ENOMEM;
 
-- 
cgit v0.10.2


From 36fae17a648e0aee5d9560514d08477ef48dc87f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:58 -0400
Subject: xfs: switch to NOFS allocation under i_lock in
 xfs_buf_associate_memory

xfs_buf_associate_memory is used for setting up the spare buffer for the
log wrap case in xlog_sync which can happen under i_lock when called from
xfs_fsync. The i_lock mutex is taken in reclaim context so all allocations
under it must avoid recursions into the filesystem.  There are a couple
more uses of xfs_buf_associate_memory in the log recovery code that are
also affected by this, but I'd rather keep the code simple than passing on
a gfp_mask argument.  Longer term we should just stop requiring the memoery
allocation in xlog_sync by some smaller rework of the buffer layer.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 1418b91..178c20c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -770,7 +770,7 @@ xfs_buf_associate_memory(
 	bp->b_pages = NULL;
 	bp->b_addr = mem;
 
-	rval = _xfs_buf_get_pages(bp, page_count, 0);
+	rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
 	if (rval)
 		return rval;
 
-- 
cgit v0.10.2


From 10746e47e722b5688fcd6eba9fbf9b2e64a248a7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:59 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_attr_rmtval_set

xfs_attr_rmtval_set is always called with i_lock held, and i_lock is taken
in reclaim context so all allocations under it must avoid recursions into
the filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index db15feb..bfb5837 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2141,8 +2141,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
 		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
 
-		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
-							blkcnt, XFS_BUF_LOCK);
+		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt,
+				       XFS_BUF_LOCK | XBF_DONT_BLOCK);
 		ASSERT(bp);
 		ASSERT(!XFS_BUF_GETERROR(bp));
 
-- 
cgit v0.10.2


From 7b02ecb3031b192823bc732ae717febc0a59aa92 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:15:00 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_readlink_bmap

xfs_readlink_bmap is called with i_lock held, but i_lock is taken in
reclaim context so all allocations under it must avoid recursions into
the filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index c4eca5e..492d75b 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -538,7 +538,9 @@ xfs_readlink_bmap(
 		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
 		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 
-		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
+		bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt),
+					XBF_LOCK | XBF_MAPPED |
+					XBF_DONT_BLOCK);
 		error = XFS_BUF_GETERROR(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_readlink",
-- 
cgit v0.10.2


From ddd3a14e0f030f0f7b900621f67532285b8657ef Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:15:01 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_attr_rmtval_get

xfs_attr_rmtval_get is always called with i_lock held, but i_lock is taken
in reclaim context so all allocations under it must avoid recursions into
the filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index bfb5837..4ece190 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2010,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 			blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 			error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
-					     blkcnt, XFS_BUF_LOCK, &bp);
+					     blkcnt,
+					     XFS_BUF_LOCK | XBF_DONT_BLOCK,
+					     &bp);
 			if (error)
 				return(error);
 
-- 
cgit v0.10.2


From e0c222c411e22f086e929cd69fdcc89336164ec1 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Mon, 20 Jul 2009 10:52:15 -0500
Subject: use XFS_CORRUPTION_ERROR in xfs_btree_check_sblock

In Red Hat Bug 512552
 - Can't write to XFS mount during raid5 resync

a user ran into corruption while resyncing a raid, and we failed
a consistency test, but didn't get much more info; it'd be nice
to call XFS_CORRUPTION_ERROR here so we can see the buffer
contents.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e9df995..2671738 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -120,8 +120,8 @@ xfs_btree_check_sblock(
 			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
 		if (bp)
 			xfs_buftrace("SBTREE ERROR", bp);
-		XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW,
-				 cur->bc_mp);
+		XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
+			XFS_ERRLEVEL_LOW, cur->bc_mp, block);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 	return 0;
-- 
cgit v0.10.2


From b89d4208de3de442c9025919c4261be0b38e79a4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Aug 2009 11:32:18 -0300
Subject: xfs: check for dinode realtime flag corruption

Ramon tested XFS with a modified version of fsfuzzer and hit a NULL
pointer dereference in __xfs_get_blocks due to the RT device target
pointer being NULL.

To fix this reject inode with the realtime bit set on a a filesystem
without an RT subvolume during inode read.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Reported-by: Ramon de Carvalho Valle <ramon@risesecurity.org>
Tested-by: Ramon de Carvalho Valle <ramon@risesecurity.org>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1f22d65..da428b3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -343,6 +343,16 @@ xfs_iformat(
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
+	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+		     !ip->i_mount->m_rtdev_targp)) {
+		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt dinode %Lu, has realtime flag set.",
+			ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
 	switch (ip->i_d.di_mode & S_IFMT) {
 	case S_IFIFO:
 	case S_IFCHR:
-- 
cgit v0.10.2


From a8914f3a6d72c97328597a556a99daaf5cc288ae Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Aug 2009 11:32:44 -0300
Subject: xfs: fix spin_is_locked assert on uni-processor builds

Without SMP or preemption spin_is_locked always returns false,
so we can't do an assert with it.  Instead use assert_spin_locked,
which does the right thing on all builds.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reported-by: Johannes Engel <jcnengel@googlemail.com>
Tested-by: Johannes Engel <jcnengel@googlemail.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3750f04..9dbdff3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3180,7 +3180,7 @@ try_again:
 STATIC void
 xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
 {
-	ASSERT(spin_is_locked(&log->l_icloglock));
+	assert_spin_locked(&log->l_icloglock);
 
 	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
 		xlog_state_switch_iclogs(log, iclog, 0);
-- 
cgit v0.10.2


From e8055139d996e85722984968472868d6dccb1490 Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@rainbow-software.org>
Date: Tue, 11 Aug 2009 20:00:11 +0200
Subject: x86: Fix oops in identify_cpu() on CPUs without CPUID

Kernel is broken for x86 CPUs without CPUID since 2.6.28. It
crashes with NULL pointer dereference in identify_cpu():

766        generic_identify(c);
767
768-->     if (this_cpu->c_identify)
769               this_cpu->c_identify(c);

this_cpu is NULL. This is because it's only initialized in
get_cpu_vendor() function, which is not called if the CPU has
no CPUID instruction.

Signed-off-by: Ondrej Zary <linux@rainbow-software.org>
LKML-Reference: <200908112000.15993.linux@rainbow-software.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f1961c0..5ce60a8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void)
 	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
 }
 
-static const struct cpu_dev *this_cpu __cpuinitdata;
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+	display_cacheinfo(c);
+#else
+	/* Not much we can do here... */
+	/* Check if at least it has cpuid */
+	if (c->cpuid_level == -1) {
+		/* No cpuid. It must be an ancient CPU */
+		if (c->x86 == 4)
+			strcpy(c->x86_model_id, "486");
+		else if (c->x86 == 3)
+			strcpy(c->x86_model_id, "386");
+	}
+#endif
+}
+
+static const struct cpu_dev __cpuinitconst default_cpu = {
+	.c_init		= default_init,
+	.c_vendor	= "Unknown",
+	.c_x86_vendor	= X86_VENDOR_UNKNOWN,
+};
+
+static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
 
 DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
@@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu)
 
 static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
 
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_X86_64
-	display_cacheinfo(c);
-#else
-	/* Not much we can do here... */
-	/* Check if at least it has cpuid */
-	if (c->cpuid_level == -1) {
-		/* No cpuid. It must be an ancient CPU */
-		if (c->x86 == 4)
-			strcpy(c->x86_model_id, "486");
-		else if (c->x86 == 3)
-			strcpy(c->x86_model_id, "386");
-	}
-#endif
-}
-
-static const struct cpu_dev __cpuinitconst default_cpu = {
-	.c_init	= default_init,
-	.c_vendor = "Unknown",
-	.c_x86_vendor = X86_VENDOR_UNKNOWN,
-};
-
 static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
 {
 	unsigned int *v;
-- 
cgit v0.10.2


From df9eba8c9febf53782ef896518e7177999d98188 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 7 Aug 2009 11:15:20 +0900
Subject: pata_at91: fix resource release

Julias Lawall discovered that pata_at91 wasn't freeing a memory region
allocated with kzalloc() on init failure paths.  Upon review,
pata_at91 also seems to be doing unnecessary explicit resource
releases for managed resources too.  Convert memory allocation to
managed one and drop unnecessary explicit resource releases.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Julia Lawall <julia@diku.dk>
Cc: Sergey Matyukevich <geomatsi@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/pata_at91.c b/drivers/ata/pata_at91.c
index 5702aff..41c94b1 100644
--- a/drivers/ata/pata_at91.c
+++ b/drivers/ata/pata_at91.c
@@ -250,7 +250,7 @@ static int __devinit pata_at91_probe(struct platform_device *pdev)
 		ata_port_desc(ap, "no IRQ, using PIO polling");
 	}
 
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL);
 
 	if (!info) {
 		dev_err(dev, "failed to allocate memory for private data\n");
@@ -275,7 +275,7 @@ static int __devinit pata_at91_probe(struct platform_device *pdev)
 	if (!info->ide_addr) {
 		dev_err(dev, "failed to map IO base\n");
 		ret = -ENOMEM;
-		goto err_ide_ioremap;
+		goto err_put;
 	}
 
 	info->alt_addr = devm_ioremap(dev,
@@ -284,7 +284,7 @@ static int __devinit pata_at91_probe(struct platform_device *pdev)
 	if (!info->alt_addr) {
 		dev_err(dev, "failed to map CTL base\n");
 		ret = -ENOMEM;
-		goto err_alt_ioremap;
+		goto err_put;
 	}
 
 	ap->ioaddr.cmd_addr = info->ide_addr;
@@ -303,13 +303,8 @@ static int __devinit pata_at91_probe(struct platform_device *pdev)
 			irq ? ata_sff_interrupt : NULL,
 			irq_flags, &pata_at91_sht);
 
-err_alt_ioremap:
-	devm_iounmap(dev, info->ide_addr);
-
-err_ide_ioremap:
+err_put:
 	clk_put(info->mck);
-	kfree(info);
-
 	return ret;
 }
 
@@ -317,7 +312,6 @@ static int __devexit pata_at91_remove(struct platform_device *pdev)
 {
 	struct ata_host *host = dev_get_drvdata(&pdev->dev);
 	struct at91_ide_info *info;
-	struct device *dev = &pdev->dev;
 
 	if (!host)
 		return 0;
@@ -328,11 +322,8 @@ static int __devexit pata_at91_remove(struct platform_device *pdev)
 	if (!info)
 		return 0;
 
-	devm_iounmap(dev, info->ide_addr);
-	devm_iounmap(dev, info->alt_addr);
 	clk_put(info->mck);
 
-	kfree(info);
 	return 0;
 }
 
-- 
cgit v0.10.2


From 1fd4bbec8c0d6db96b02141f324066afa2e77e89 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 6 Aug 2009 17:47:05 +0200
Subject: pata_atiixp: fix second channel support

PIO and MWDMA timings are never programmed for the second channel
because timing registers are treated as 16-bit long ones.

The bug is an attixp -> pata_atiixp regression and goes back to:

	commit 669a5db411d85a14f86cd92bc16bf7ab5b8aa235
	Author: Jeff Garzik <jeff@garzik.org>
	Date:   Tue Aug 29 18:12:40 2006 -0400

	    [libata] Add a bunch of PATA drivers.

Cc: Krystian Juskowiak <jusko@tlen.pl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bbpetkov@yahoo.de>
Cc: Robert Hancock <hancockrwd@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/pata_atiixp.c b/drivers/ata/pata_atiixp.c
index bec0b8a..4591556 100644
--- a/drivers/ata/pata_atiixp.c
+++ b/drivers/ata/pata_atiixp.c
@@ -1,6 +1,7 @@
 /*
  * pata_atiixp.c 	- ATI PATA for new ATA layer
  *			  (C) 2005 Red Hat Inc
+ *			  (C) 2009 Bartlomiej Zolnierkiewicz
  *
  * Based on
  *
@@ -61,20 +62,19 @@ static void atiixp_set_pio_timing(struct ata_port *ap, struct ata_device *adev,
 
 	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
 	int dn = 2 * ap->port_no + adev->devno;
-
-	/* Check this is correct - the order is odd in both drivers */
 	int timing_shift = (16 * ap->port_no) + 8 * (adev->devno ^ 1);
-	u16 pio_mode_data, pio_timing_data;
+	u32 pio_timing_data;
+	u16 pio_mode_data;
 
 	pci_read_config_word(pdev, ATIIXP_IDE_PIO_MODE, &pio_mode_data);
 	pio_mode_data &= ~(0x7 << (4 * dn));
 	pio_mode_data |= pio << (4 * dn);
 	pci_write_config_word(pdev, ATIIXP_IDE_PIO_MODE, pio_mode_data);
 
-	pci_read_config_word(pdev, ATIIXP_IDE_PIO_TIMING, &pio_timing_data);
+	pci_read_config_dword(pdev, ATIIXP_IDE_PIO_TIMING, &pio_timing_data);
 	pio_timing_data &= ~(0xFF << timing_shift);
 	pio_timing_data |= (pio_timings[pio] << timing_shift);
-	pci_write_config_word(pdev, ATIIXP_IDE_PIO_TIMING, pio_timing_data);
+	pci_write_config_dword(pdev, ATIIXP_IDE_PIO_TIMING, pio_timing_data);
 }
 
 /**
@@ -119,16 +119,17 @@ static void atiixp_set_dmamode(struct ata_port *ap, struct ata_device *adev)
 		udma_mode_data |= dma << (4 * dn);
 		pci_write_config_word(pdev, ATIIXP_IDE_UDMA_MODE, udma_mode_data);
 	} else {
-		u16 mwdma_timing_data;
-		/* Check this is correct - the order is odd in both drivers */
 		int timing_shift = (16 * ap->port_no) + 8 * (adev->devno ^ 1);
+		u32 mwdma_timing_data;
 
 		dma -= XFER_MW_DMA_0;
 
-		pci_read_config_word(pdev, ATIIXP_IDE_MWDMA_TIMING, &mwdma_timing_data);
+		pci_read_config_dword(pdev, ATIIXP_IDE_MWDMA_TIMING,
+				      &mwdma_timing_data);
 		mwdma_timing_data &= ~(0xFF << timing_shift);
 		mwdma_timing_data |= (mwdma_timings[dma] << timing_shift);
-		pci_write_config_word(pdev, ATIIXP_IDE_MWDMA_TIMING, mwdma_timing_data);
+		pci_write_config_dword(pdev, ATIIXP_IDE_MWDMA_TIMING,
+				       mwdma_timing_data);
 	}
 	/*
 	 *	We must now look at the PIO mode situation. We may need to
-- 
cgit v0.10.2


From 7831387bda72af3059be48d39846d3eb6d8ce2f6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 7 Aug 2009 01:59:15 +0900
Subject: libata: OCZ Vertex can't do HPA

OCZ Vertex SSD can't do HPA and not in a usual way.  It reports HPA,
allows unlocking but then fails all IOs which fall in the unlocked
area.  Quirk it so that HPA unlocking is not used for the device.

Reported by Daniel Perup in bnc#522414.

 https://bugzilla.novell.com/show_bug.cgi?id=522414

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Daniel Perup <probe@spray.se>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 8ac98ff..072ba5e 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4302,6 +4302,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 	{ "WDC WD2500JD-00HBB0", "WD-WMAL71490727", ATA_HORKAGE_BROKEN_HPA },
 	{ "MAXTOR 6L080L4",	"A93.0500",	ATA_HORKAGE_BROKEN_HPA },
 
+	/* this one allows HPA unlocking but fails IOs on the area */
+	{ "OCZ-VERTEX",		    "1.30",	ATA_HORKAGE_BROKEN_HPA },
+
 	/* Devices which report 1 sector over size HPA */
 	{ "ST340823A",		NULL,		ATA_HORKAGE_HPA_SIZE, },
 	{ "ST320413A",		NULL,		ATA_HORKAGE_HPA_SIZE, },
-- 
cgit v0.10.2


From 51c8949950647afeeb897e08dd75ad99078adb50 Mon Sep 17 00:00:00 2001
From: Tony Vroon <tony@linx.net>
Date: Thu, 6 Aug 2009 00:50:09 +0100
Subject: sata_nv: MSI support, disabled by default

At least the nVidia MCP55 controller quite happily supports MSI.
This adds an option to use it. It is disabled by default.
As per feedback by Robert Hancock, it will honour the user
request as the kernel will not enable MSI where the controller
or the specific system configuration do not support it.

Signed-off-by: Tony Vroon <tony@linx.net>
Cc: Robert Hancock <hancockrwd@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index b2d11f3..86a4058 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -602,6 +602,7 @@ MODULE_VERSION(DRV_VERSION);
 
 static int adma_enabled;
 static int swncq_enabled = 1;
+static int msi_enabled;
 
 static void nv_adma_register_mode(struct ata_port *ap)
 {
@@ -2459,6 +2460,11 @@ static int nv_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	} else if (type == SWNCQ)
 		nv_swncq_host_init(host);
 
+	if (msi_enabled) {
+		dev_printk(KERN_NOTICE, &pdev->dev, "Using MSI\n");
+		pci_enable_msi(pdev);
+	}
+
 	pci_set_master(pdev);
 	return ata_host_activate(host, pdev->irq, ipriv->irq_handler,
 				 IRQF_SHARED, ipriv->sht);
@@ -2558,4 +2564,6 @@ module_param_named(adma, adma_enabled, bool, 0444);
 MODULE_PARM_DESC(adma, "Enable use of ADMA (Default: false)");
 module_param_named(swncq, swncq_enabled, bool, 0444);
 MODULE_PARM_DESC(swncq, "Enable use of SWNCQ (Default: true)");
+module_param_named(msi, msi_enabled, bool, 0444);
+MODULE_PARM_DESC(msi, "Enable use of MSI (Default: false)");
 
-- 
cgit v0.10.2


From 20308871588518b5e209c403de2a3ad9a2eba9af Mon Sep 17 00:00:00 2001
From: Michael Prokop <mika@grml.org>
Date: Thu, 6 Aug 2009 00:14:10 +0200
Subject: Documentation/kernel-parameters.txt: document libata's ignore_hpa
 option

By default the kernel honors the HPA (host protected area) of hard
drives.  Using libata's ignore_hpa module option it's possible to
change this behaviour.

Document usage and options of libata.ignore_hpa in
Documentation/kernel-parameters.txt.

Signed-off-by: Michael Prokop <mika@grml.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index dd1a6d4..7936b80 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1115,6 +1115,10 @@ and is between 256 and 4096 characters. It is defined in the file
 			libata.dma=4	  Compact Flash DMA only 
 			Combinations also work, so libata.dma=3 enables DMA
 			for disks and CDROMs, but not CFs.
+	
+	libata.ignore_hpa=	[LIBATA] Ignore HPA limit
+			libata.ignore_hpa=0	  keep BIOS limits (default)
+			libata.ignore_hpa=1	  ignore limits, using full disk
 
 	libata.noacpi	[LIBATA] Disables use of ACPI in libata suspend/resume
 			when set.
-- 
cgit v0.10.2


From b6931c1fbaf7fda9ea7f120228a96600d7090049 Mon Sep 17 00:00:00 2001
From: Shane Huang <shane.huang@amd.com>
Date: Wed, 5 Aug 2009 10:10:41 +0800
Subject: ahci: Soften up the dmesg on SB600 PMP softreset failure recovery

Too strong words led to spurious bug reports: Novell bugzilla #527748,
RedHat bugzilla #468800. This patch is used to soften up the dmesg on
SB600 PMP softreset failure recovery, so as to remove the scariness and
concern from community.

Reported-by: pgnet Dev <pgnet.dev@gmail.com>
Signed-off-by: Shane Huang <shane.huang@amd.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 958c1fa..838ff73 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1773,7 +1773,8 @@ static int ahci_sb600_softreset(struct ata_link *link, unsigned int *class,
 		irq_sts = readl(port_mmio + PORT_IRQ_STAT);
 		if (irq_sts & PORT_IRQ_BAD_PMP) {
 			ata_link_printk(link, KERN_WARNING,
-					"failed due to HW bug, retry pmp=0\n");
+					"applying SB600 PMP SRST workaround "
+					"and retrying\n");
 			rc = ahci_do_softreset(link, class, 0, deadline,
 					       ahci_check_ready);
 		}
-- 
cgit v0.10.2


From 5594639aab8b5614cb27a3e5b2b627505cbcd137 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 4 Aug 2009 14:30:08 +0900
Subject: ahci: add workaround for on-board 5723s on some gigabyte boards

Some gigabytes have on-board SIMG5723s connected to JMB ahcis.  These
are used to implement hardware raid.  Unfortunately some firmware
revisions on these 5723s don't bring the link down when all the
downstream ports are unoccupied while not responding to reset protocol
which makes libata think that there's device attached to the port but
is not responding and retry.  This results in painfully wrong boot
detection time for these ports when they're empty.

This patch quirks those boards such that ahci gives up after the
initial timeout.  Combined with parallel probing, this gives quick
enough probing and also is safe because SIMG5723 will respond to the
first try if any of the downstream ports is occupied.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Marc Bowes <marcbowes@gmail.com>
Reported-by: Nicolas Mailhot <Nicolas.Mailhot@LaPoste.net>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 838ff73..fe3eba5 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -219,6 +219,8 @@ enum {
 	AHCI_HFLAG_SECT255		= (1 << 8), /* max 255 sectors */
 	AHCI_HFLAG_YES_NCQ		= (1 << 9), /* force NCQ cap on */
 	AHCI_HFLAG_NO_SUSPEND		= (1 << 10), /* don't suspend */
+	AHCI_HFLAG_SRST_TOUT_IS_OFFLINE	= (1 << 11), /* treat SRST timeout as
+							link offline */
 
 	/* ap->flags bits */
 
@@ -1663,6 +1665,7 @@ static int ahci_do_softreset(struct ata_link *link, unsigned int *class,
 			     int (*check_ready)(struct ata_link *link))
 {
 	struct ata_port *ap = link->ap;
+	struct ahci_host_priv *hpriv = ap->host->private_data;
 	const char *reason = NULL;
 	unsigned long now, msecs;
 	struct ata_taskfile tf;
@@ -1701,12 +1704,21 @@ static int ahci_do_softreset(struct ata_link *link, unsigned int *class,
 
 	/* wait for link to become ready */
 	rc = ata_wait_after_reset(link, deadline, check_ready);
-	/* link occupied, -ENODEV too is an error */
-	if (rc) {
+	if (rc == -EBUSY && hpriv->flags & AHCI_HFLAG_SRST_TOUT_IS_OFFLINE) {
+		/*
+		 * Workaround for cases where link online status can't
+		 * be trusted.  Treat device readiness timeout as link
+		 * offline.
+		 */
+		ata_link_printk(link, KERN_INFO,
+				"device not ready, treating as offline\n");
+		*class = ATA_DEV_NONE;
+	} else if (rc) {
+		/* link occupied, -ENODEV too is an error */
 		reason = "device not ready";
 		goto fail;
-	}
-	*class = ahci_dev_classify(ap);
+	} else
+		*class = ahci_dev_classify(ap);
 
 	DPRINTK("EXIT, class=%u\n", *class);
 	return 0;
@@ -2727,6 +2739,56 @@ static bool ahci_broken_suspend(struct pci_dev *pdev)
 	return !ver || strcmp(ver, dmi->driver_data) < 0;
 }
 
+static bool ahci_broken_online(struct pci_dev *pdev)
+{
+#define ENCODE_BUSDEVFN(bus, slot, func)			\
+	(void *)(unsigned long)(((bus) << 8) | PCI_DEVFN((slot), (func)))
+	static const struct dmi_system_id sysids[] = {
+		/*
+		 * There are several gigabyte boards which use
+		 * SIMG5723s configured as hardware RAID.  Certain
+		 * 5723 firmware revisions shipped there keep the link
+		 * online but fail to answer properly to SRST or
+		 * IDENTIFY when no device is attached downstream
+		 * causing libata to retry quite a few times leading
+		 * to excessive detection delay.
+		 *
+		 * As these firmwares respond to the second reset try
+		 * with invalid device signature, considering unknown
+		 * sig as offline works around the problem acceptably.
+		 */
+		{
+			.ident = "EP45-DQ6",
+			.matches = {
+				DMI_MATCH(DMI_BOARD_VENDOR,
+					  "Gigabyte Technology Co., Ltd."),
+				DMI_MATCH(DMI_BOARD_NAME, "EP45-DQ6"),
+			},
+			.driver_data = ENCODE_BUSDEVFN(0x0a, 0x00, 0),
+		},
+		{
+			.ident = "EP45-DS5",
+			.matches = {
+				DMI_MATCH(DMI_BOARD_VENDOR,
+					  "Gigabyte Technology Co., Ltd."),
+				DMI_MATCH(DMI_BOARD_NAME, "EP45-DS5"),
+			},
+			.driver_data = ENCODE_BUSDEVFN(0x03, 0x00, 0),
+		},
+		{ }	/* terminate list */
+	};
+#undef ENCODE_BUSDEVFN
+	const struct dmi_system_id *dmi = dmi_first_match(sysids);
+	unsigned int val;
+
+	if (!dmi)
+		return false;
+
+	val = (unsigned long)dmi->driver_data;
+
+	return pdev->bus->number == (val >> 8) && pdev->devfn == (val & 0xff);
+}
+
 static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	static int printed_version;
@@ -2842,6 +2904,12 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 			   "BIOS update required for suspend/resume\n");
 	}
 
+	if (ahci_broken_online(pdev)) {
+		hpriv->flags |= AHCI_HFLAG_SRST_TOUT_IS_OFFLINE;
+		dev_info(&pdev->dev,
+			 "online status unreliable, applying workaround\n");
+	}
+
 	/* CAP.NP sometimes indicate the index of the last enabled
 	 * port, at other times, that of the last possible port, so
 	 * determining the maximum port number requires looking at
-- 
cgit v0.10.2


From 67fe0688082509c52bd451d10a61b3565169c23e Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Wed, 12 Aug 2009 06:29:57 -0400
Subject: Remove zero-length file drivers/mtd/maps/sbc8240.c

It was "deleted" in commit 2bf961b7ccd69e108ac435c67e2b0522b403c578

Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/mtd/maps/sbc8240.c b/drivers/mtd/maps/sbc8240.c
deleted file mode 100644
index e69de29..0000000
-- 
cgit v0.10.2


From 2a8083f063472f27c253545dd64e1a7bbbb1ab61 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 Aug 2009 16:22:00 -0300
Subject: perf record: Fix .tid and .pid fill-in when synthesizing events
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Noticed when trying to record events for a firefox thread. We
were synthesizing both .tid and .pid with the pid passed via
--pid.

Fix it by reading /proc/PID/status and getting the tgid
to use in .pid, .tid gets the specified "pid".

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090811192200.GF18061@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 0345aad..30b83de 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -203,46 +203,48 @@ static void sig_atexit(void)
 	kill(getpid(), signr);
 }
 
-static void pid_synthesize_comm_event(pid_t pid, int full)
+static pid_t pid_synthesize_comm_event(pid_t pid, int full)
 {
 	struct comm_event comm_ev;
 	char filename[PATH_MAX];
 	char bf[BUFSIZ];
-	int fd;
-	size_t size;
-	char *field, *sep;
+	FILE *fp;
+	size_t size = 0;
 	DIR *tasks;
 	struct dirent dirent, *next;
+	pid_t tgid = 0;
 
-	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
+	snprintf(filename, sizeof(filename), "/proc/%d/status", pid);
 
-	fd = open(filename, O_RDONLY);
-	if (fd < 0) {
+	fp = fopen(filename, "r");
+	if (fd == NULL) {
 		/*
 		 * We raced with a task exiting - just return:
 		 */
 		if (verbose)
 			fprintf(stderr, "couldn't open %s\n", filename);
-		return;
+		return 0;
 	}
-	if (read(fd, bf, sizeof(bf)) < 0) {
-		fprintf(stderr, "couldn't read %s\n", filename);
-		exit(EXIT_FAILURE);
-	}
-	close(fd);
 
-	/* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
 	memset(&comm_ev, 0, sizeof(comm_ev));
-	field = strchr(bf, '(');
-	if (field == NULL)
-		goto out_failure;
-	sep = strchr(++field, ')');
-	if (sep == NULL)
-		goto out_failure;
-	size = sep - field;
-	memcpy(comm_ev.comm, field, size++);
-
-	comm_ev.pid = pid;
+	while (!comm_ev.comm[0] || !comm_ev.pid) {
+		if (fgets(bf, sizeof(bf), fp) == NULL)
+			goto out_failure;
+
+		if (memcmp(bf, "Name:", 5) == 0) {
+			char *name = bf + 5;
+			while (*name && isspace(*name))
+				++name;
+			size = strlen(name) - 1;
+			memcpy(comm_ev.comm, name, size++);
+		} else if (memcmp(bf, "Tgid:", 5) == 0) {
+			char *tgids = bf + 5;
+			while (*tgids && isspace(*tgids))
+				++tgids;
+			tgid = comm_ev.pid = atoi(tgids);
+		}
+	}
+
 	comm_ev.header.type = PERF_EVENT_COMM;
 	size = ALIGN(size, sizeof(u64));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
@@ -251,7 +253,7 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 		comm_ev.tid = pid;
 
 		write_output(&comm_ev, comm_ev.header.size);
-		return;
+		goto out_fclose;
 	}
 
 	snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
@@ -268,7 +270,10 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 		write_output(&comm_ev, comm_ev.header.size);
 	}
 	closedir(tasks);
-	return;
+
+out_fclose:
+	fclose(fp);
+	return tgid;
 
 out_failure:
 	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
@@ -276,7 +281,7 @@ out_failure:
 	exit(EXIT_FAILURE);
 }
 
-static void pid_synthesize_mmap_samples(pid_t pid)
+static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid)
 {
 	char filename[PATH_MAX];
 	FILE *fp;
@@ -328,7 +333,7 @@ static void pid_synthesize_mmap_samples(pid_t pid)
 			mmap_ev.len -= mmap_ev.start;
 			mmap_ev.header.size = (sizeof(mmap_ev) -
 					       (sizeof(mmap_ev.filename) - size));
-			mmap_ev.pid = pid;
+			mmap_ev.pid = tgid;
 			mmap_ev.tid = pid;
 
 			write_output(&mmap_ev, mmap_ev.header.size);
@@ -347,14 +352,14 @@ static void synthesize_all(void)
 
 	while (!readdir_r(proc, &dirent, &next) && next) {
 		char *end;
-		pid_t pid;
+		pid_t pid, tgid;
 
 		pid = strtol(dirent.d_name, &end, 10);
 		if (*end) /* only interested in proper numerical dirents */
 			continue;
 
-		pid_synthesize_comm_event(pid, 1);
-		pid_synthesize_mmap_samples(pid);
+		tgid = pid_synthesize_comm_event(pid, 1);
+		pid_synthesize_mmap_samples(pid, tgid);
 	}
 
 	closedir(proc);
@@ -567,8 +572,8 @@ static int __cmd_record(int argc, const char **argv)
 		perf_header__write(header, output);
 
 	if (!system_wide) {
-		pid_synthesize_comm_event(pid, 0);
-		pid_synthesize_mmap_samples(pid);
+		pid_t tgid = pid_synthesize_comm_event(pid, 0);
+		pid_synthesize_mmap_samples(pid, tgid);
 	} else
 		synthesize_all();
 
-- 
cgit v0.10.2


From 94a24752fe95ca1e7f98b197052d44e6a207740d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 Aug 2009 16:21:38 -0300
Subject: perf report: Show the tid too in -D
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This made it easier to find the firefox threading related
bug.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090811192138.GE18061@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 99274ce..23e1457 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1526,11 +1526,11 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 		more_data += sizeof(u64);
 	}
 
-	dprintf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d: %p period: %Ld\n",
+	dprintf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->header.misc,
-		event->ip.pid,
+		event->ip.pid, event->ip.tid,
 		(void *)(long)ip,
 		(long long)period);
 
@@ -1612,10 +1612,11 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
 	struct thread *thread = threads__findnew(event->mmap.pid);
 	struct map *map = map__new(&event->mmap);
 
-	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+	dprintf("%p [%p]: PERF_EVENT_MMAP %d/%d: [%p(%p) @ %p]: %s\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->mmap.pid,
+		event->mmap.tid,
 		(void *)(long)event->mmap.start,
 		(void *)(long)event->mmap.len,
 		(void *)(long)event->mmap.pgoff,
-- 
cgit v0.10.2


From 247648e3742ded01e42a4b14c2da330b13cbb47f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 Aug 2009 16:22:11 -0300
Subject: perf tools: Fix fallback to cplus_demangle() when bfd_demangle() is
 not available
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In old binutils we can't access bfd_demangle(), use
cplus_demangle() just like oprofile.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Luis Claudio R. Gonçalves <lclaudio@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090811192211.GG18061@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 60411e9..c045b42 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -382,22 +382,29 @@ endif
 ifdef NO_DEMANGLE
 	BASIC_CFLAGS += -DNO_DEMANGLE
 else
-
 	has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd > /dev/null 2>&1 && echo y")
 
-	has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y")
-
-	has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y")
-
 	ifeq ($(has_bfd),y)
 		EXTLIBS += -lbfd
-	else ifeq ($(has_bfd_iberty),y)
-		EXTLIBS += -lbfd -liberty
-	else ifeq ($(has_bfd_iberty_z),y)
-		EXTLIBS += -lbfd -liberty -lz
 	else
-		msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling)
-		BASIC_CFLAGS += -DNO_DEMANGLE
+		has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y")
+		ifeq ($(has_bfd_iberty),y)
+			EXTLIBS += -lbfd -liberty
+		else
+			has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y")
+			ifeq ($(has_bfd_iberty_z),y)
+				EXTLIBS += -lbfd -liberty -lz
+			else
+				has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -liberty > /dev/null 2>&1 && echo y")
+				ifeq ($(has_cplus_demangle),y)
+					EXTLIBS += -liberty
+					BASIC_CFLAGS += -DHAVE_CPLUS_DEMANGLE
+				else
+					msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling)
+					BASIC_CFLAGS += -DNO_DEMANGLE
+				endif
+			endif
+		endif
 	endif
 endif
 
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index f1dcede..2473fd4 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -7,23 +7,8 @@
 #include <gelf.h>
 #include <elf.h>
 
-#ifndef NO_DEMANGLE
-#include <bfd.h>
-#else
-static inline
-char *bfd_demangle(void __used *v, const char __used *c, int __used i)
-{
-	return NULL;
-}
-#endif
-
 const char *sym_hist_filter;
 
-#ifndef DMGL_PARAMS
-#define DMGL_PARAMS      (1 << 0)       /* Include function args */
-#define DMGL_ANSI        (1 << 1)       /* Include const, volatile, etc */
-#endif
-
 enum dso_origin {
 	DSO__ORIG_KERNEL = 0,
 	DSO__ORIG_JAVA_JIT,
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 1e003ec..b53bf01 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -7,6 +7,30 @@
 #include <linux/rbtree.h>
 #include "module.h"
 
+#ifdef HAVE_CPLUS_DEMANGLE
+extern char *cplus_demangle(const char *, int);
+
+static inline char *bfd_demangle(void __used *v, const char *c, int i)
+{
+	return cplus_demangle(c, i);
+}
+#else
+#ifdef NO_DEMANGLE
+static inline char *bfd_demangle(void __used *v, const char __used *c,
+				 int __used i)
+{
+	return NULL;
+}
+#else
+#include <bfd.h>
+#endif
+#endif
+
+#ifndef DMGL_PARAMS
+#define DMGL_PARAMS      (1 << 0)       /* Include function args */
+#define DMGL_ANSI        (1 << 1)       /* Include const, volatile, etc */
+#endif
+
 struct symbol {
 	struct rb_node	rb_node;
 	u64		start;
-- 
cgit v0.10.2


From 1340e6bbaff7ff7f6f75eb4a5c34933efce84a84 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 Aug 2009 17:04:36 -0300
Subject: perf tools: Fix dso__new handle() to handle deleted DSOs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is better than showing the map addr, this way at least we
know that we can't get the symtabs because the DSO was deleted
(system update) while an app still used such DSO.

Yeah, don't do that, but if you do, you'll figure it out
quicker this way.

[acme@doppio linux-2.6-tip]$ perf report | head -15
 # Samples: 3796
 #
 # Overhead  Command                                                        Shared Object  Symbol
 # ........  .......  ...................................................................  ......
 #
    23.55%   pidgin  /lib64/libglib-2.0.so.0.2000.4.#prelink#.Pd98lu (deleted)            [.] 0x00000000038844
    21.55%   pidgin  /lib64/libpthread-2.10.1.so.#prelink#.AFwK8Q (deleted)               [.] 0x0000000000a42d
    10.85%   pidgin  [kernel]                                                             [.] vread_hpet
     7.85%   pidgin  /lib64/libgobject-2.0.so.0.2000.4.#prelink#.o1vpU7 (deleted)         [.] 0x00000000014de8
     3.35%   pidgin  /lib64/libc-2.10.1.so (deleted)                                      [.] 0x0000000007a875
     3.19%   pidgin  /lib64/libdbus-1.so.3.4.0.#prelink#.6mwgZP (deleted)                 [.] 0x0000000001d254
     3.06%   pidgin  /usr/lib64/libgtk-x11-2.0.so.0.1600.5.#prelink#.511hAl (deleted)     [.] 0x000000002334e7
     2.90%   pidgin  /usr/lib64/libgdk-x11-2.0.so.0.1600.5.#prelink#.5qlMo1 (deleted)     [.] 0x00000000037b2d
     1.84%   pidgin  [kernel]                                                             [k] do_sys_poll
     1.45%   pidgin  /usr/lib64/libX11.so.6.2.0.#prelink#.iR59Rx (deleted)                [.] 0x0000000004c751
[acme@doppio linux-2.6-tip]$

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Luis Claudio R. Gonçalves <lclaudio@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090811200436.GA3478@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 2473fd4..5c0f42e 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -801,6 +801,8 @@ more:
 	}
 out:
 	free(name);
+	if (ret < 0 && strstr(self->name, " (deleted)") != NULL)
+		return 0;
 	return ret;
 }
 
-- 
cgit v0.10.2


From 0a5ac84650fb7a7f226814103d95724e34b012ae Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 12 Aug 2009 11:18:01 +0200
Subject: perf record: Add missing -C option support for specifying profile cpu

perf top supports a -C for setting the profile CPU, but perf
record does not. This adds the same option for perf record,
allowing the user to specify a specific target profile CPU.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090812091801.GC12579@kernel.dk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 30b83de..de76008 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -35,6 +35,7 @@ static const char		*output_name			= "perf.data";
 static int			group				= 0;
 static unsigned int		realtime_prio			= 0;
 static int			system_wide			= 0;
+static int			profile_cpu			= -1;
 static pid_t			target_pid			= -1;
 static int			inherit				= 1;
 static int			force				= 0;
@@ -431,6 +432,8 @@ try_again:
 
 		if (err == EPERM)
 			die("Permission error - are you root?\n");
+		else if (err ==  ENODEV && profile_cpu != -1)
+			die("No such device - did you specify an out-of-range profile CPU?\n");
 
 		/*
 		 * If it's cycles then fall back to hrtimer
@@ -564,9 +567,15 @@ static int __cmd_record(int argc, const char **argv)
 		if (pid == -1)
 			pid = getpid();
 
-		open_counters(-1, pid);
-	} else for (i = 0; i < nr_cpus; i++)
-		open_counters(i, target_pid);
+		open_counters(profile_cpu, pid);
+	} else {
+		if (profile_cpu != -1) {
+			open_counters(profile_cpu, target_pid);
+		} else {
+			for (i = 0; i < nr_cpus; i++)
+				open_counters(i, target_pid);
+		}
+	}
 
 	if (file_new)
 		perf_header__write(header, output);
@@ -645,6 +654,8 @@ static const struct option options[] = {
 			    "system-wide collection from all CPUs"),
 	OPT_BOOLEAN('A', "append", &append_file,
 			    "append to the output file to do incremental profiling"),
+	OPT_INTEGER('C', "profile_cpu", &profile_cpu,
+			    "CPU to profile on"),
 	OPT_BOOLEAN('f', "force", &force,
 			"overwrite existing data file"),
 	OPT_LONG('c', "count", &default_interval,
-- 
cgit v0.10.2


From 04da8a43da804723a550f00dd158fd5b5e25ae35 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 11 Aug 2009 10:40:08 +0200
Subject: perf_counter, x86: Fix/improve apic fallback

Johannes Stezenbach reported that his Pentium-M based
laptop does not have the local APIC enabled by default,
and hence perfcounters do not get initialized.

Add a fallback for this case: allow non-sampled counters
and return with an error on sampled counters. This allows
'perf stat' to work out of box - and allows 'perf top'
and 'perf record' to fall back on a hrtimer based sampling
method.

( Passing 'lapic' on the boot line will allow hardware
  sampling to occur - but if the APIC is disabled
  permanently by the hardware then this fallback still
  allows more systems to use perfcounters. )

Also decouple perfcounter support from X86_LOCAL_APIC.

-v2: fix typo breaking counters on all other systems ...

Reported-by: Johannes Stezenbach <js@sig21.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 738bdc6..13ffa5d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,6 +24,7 @@ config X86
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
+	select HAVE_PERF_COUNTERS if (!M386 && !M486)
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
 	select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -742,7 +743,6 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
-	select HAVE_PERF_COUNTERS if (!M386 && !M486)
 
 config X86_IO_APIC
 	def_bool y
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fffc126..900332b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -55,6 +55,7 @@ struct x86_pmu {
 	int		num_counters_fixed;
 	int		counter_bits;
 	u64		counter_mask;
+	int		apic;
 	u64		max_period;
 	u64		intel_ctrl;
 };
@@ -613,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
 
 static bool reserve_pmc_hardware(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -627,9 +629,11 @@ static bool reserve_pmc_hardware(void)
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
+#endif
 
 	return true;
 
+#ifdef CONFIG_X86_LOCAL_APIC
 eventsel_fail:
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
@@ -644,10 +648,12 @@ perfctr_fail:
 		enable_lapic_nmi_watchdog();
 
 	return false;
+#endif
 }
 
 static void release_pmc_hardware(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	for (i = 0; i < x86_pmu.num_counters; i++) {
@@ -657,6 +663,7 @@ static void release_pmc_hardware(void)
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
+#endif
 }
 
 static void hw_perf_counter_destroy(struct perf_counter *counter)
@@ -748,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->sample_period = x86_pmu.max_period;
 		hwc->last_period = hwc->sample_period;
 		atomic64_set(&hwc->period_left, hwc->sample_period);
+	} else {
+		/*
+		 * If we have a PMU initialized but no APIC
+		 * interrupts, we cannot sample hardware
+		 * counters (user-space has to fall back and
+		 * sample via a hrtimer based software counter):
+		 */
+		if (!x86_pmu.apic)
+			return -EOPNOTSUPP;
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
@@ -1449,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)
 
 void set_perf_counter_pending(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+#endif
 }
 
 void perf_counters_lapic_init(void)
 {
-	if (!x86_pmu_initialized())
+#ifdef CONFIG_X86_LOCAL_APIC
+	if (!x86_pmu.apic || !x86_pmu_initialized())
 		return;
 
 	/*
 	 * Always use NMI for PMU
 	 */
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
 }
 
 static int __kprobes
@@ -1484,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
 
 	regs = args->regs;
 
+#ifdef CONFIG_X86_LOCAL_APIC
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
 	/*
 	 * Can't rely on the handled return value to say it was our NMI, two
 	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1515,6 +1537,7 @@ static struct x86_pmu p6_pmu = {
 	.event_map		= p6_pmu_event_map,
 	.raw_event		= p6_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
+	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
 	.version		= 0,
 	.num_counters		= 2,
@@ -1541,6 +1564,7 @@ static struct x86_pmu intel_pmu = {
 	.event_map		= intel_pmu_event_map,
 	.raw_event		= intel_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+	.apic			= 1,
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
 	 * so we install an artificial 1<<31 period regardless of
@@ -1564,6 +1588,7 @@ static struct x86_pmu amd_pmu = {
 	.num_counters		= 4,
 	.counter_bits		= 48,
 	.counter_mask		= (1ULL << 48) - 1,
+	.apic			= 1,
 	/* use highest bit to detect overflow */
 	.max_period		= (1ULL << 47) - 1,
 };
@@ -1589,13 +1614,14 @@ static int p6_pmu_init(void)
 		return -ENODEV;
 	}
 
+	x86_pmu = p6_pmu;
+
 	if (!cpu_has_apic) {
 		pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
-		return -ENODEV;
+		pr_info("no hardware sampling interrupt available.\n");
+		x86_pmu.apic = 0;
 	}
 
-	x86_pmu				= p6_pmu;
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From 1ae88b2e446261c038f2c0c3150ffae142b227a2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 12 Aug 2009 09:12:30 -0400
Subject: NFS: Fix an O_DIRECT Oops...

We can't call nfs_readdata_release()/nfs_writedata_release() without
first initialising and referencing args.context. Doing so inside
nfs_direct_read_schedule_segment()/nfs_direct_write_schedule_segment()
causes an Oops.

We should rather be calling nfs_readdata_free()/nfs_writedata_free() in
those cases.

Looking at the O_DIRECT code, the "struct nfs_direct_req" is already
referencing the nfs_open_context for us. Since the readdata and writedata
structures carry a reference to that, we can simplify things by getting rid
of the extra nfs_open_context references, so that we can replace all
instances of nfs_readdata_release()/nfs_writedata_release().

Reported-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Tested-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 489fc01..e4e089a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata)
 
 	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
-	nfs_readdata_release(calldata);
+	nfs_readdata_free(data);
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 					data->npages, 1, 0, data->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
 		if (result < 0) {
-			nfs_readdata_release(data);
+			nfs_readdata_free(data);
 			break;
 		}
 		if ((unsigned)result < data->npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
 				nfs_direct_release_pages(data->pagevec, result);
-				nfs_readdata_release(data);
+				nfs_readdata_free(data);
 				break;
 			}
 			bytes -= pgbase;
@@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->inode = inode;
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
-		data->args.context = get_nfs_open_context(ctx);
+		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
@@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 		struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
 		nfs_direct_release_pages(data->pagevec, data->npages);
-		nfs_writedata_release(data);
+		nfs_writedata_free(data);
 	}
 }
 
@@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata)
 
 	dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
 	nfs_direct_write_complete(dreq, data->inode);
-	nfs_commitdata_release(calldata);
+	nfs_commit_free(data);
 }
 
 static const struct rpc_call_ops nfs_commit_direct_ops = {
@@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->args.fh = NFS_FH(data->inode);
 	data->args.offset = 0;
 	data->args.count = 0;
-	data->args.context = get_nfs_open_context(dreq->ctx);
+	data->args.context = dreq->ctx;
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
@@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 					data->npages, 0, 0, data->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
 		if (result < 0) {
-			nfs_writedata_release(data);
+			nfs_writedata_free(data);
 			break;
 		}
 		if ((unsigned)result < data->npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
 				nfs_direct_release_pages(data->pagevec, result);
-				nfs_writedata_release(data);
+				nfs_writedata_free(data);
 				break;
 			}
 			bytes -= pgbase;
@@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		data->inode = inode;
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
-		data->args.context = get_nfs_open_context(ctx);
+		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 73ea5e8..12c9e66 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -60,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	return p;
 }
 
-static void nfs_readdata_free(struct nfs_read_data *p)
+void nfs_readdata_free(struct nfs_read_data *p)
 {
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
 	mempool_free(p, nfs_rdata_mempool);
 }
 
-void nfs_readdata_release(void *data)
+static void nfs_readdata_release(struct nfs_read_data *rdata)
 {
-	struct nfs_read_data *rdata = data;
-
 	put_nfs_open_context(rdata->args.context);
 	nfs_readdata_free(rdata);
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0a0a2ff..a34fae2 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -87,17 +87,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 	return p;
 }
 
-static void nfs_writedata_free(struct nfs_write_data *p)
+void nfs_writedata_free(struct nfs_write_data *p)
 {
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
 	mempool_free(p, nfs_wdata_mempool);
 }
 
-void nfs_writedata_release(void *data)
+static void nfs_writedata_release(struct nfs_write_data *wdata)
 {
-	struct nfs_write_data *wdata = data;
-
 	put_nfs_open_context(wdata->args.context);
 	nfs_writedata_free(wdata);
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index fdffb41..f6b9024 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -473,7 +473,6 @@ extern int  nfs_writepages(struct address_space *, struct writeback_control *);
 extern int  nfs_flush_incompatible(struct file *file, struct page *page);
 extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
 extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
-extern void nfs_writedata_release(void *);
 
 /*
  * Try to write back everything synchronously (but check the
@@ -488,7 +487,6 @@ extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 extern int  nfs_commit_inode(struct inode *, int);
 extern struct nfs_write_data *nfs_commitdata_alloc(void);
 extern void nfs_commit_free(struct nfs_write_data *wdata);
-extern void nfs_commitdata_release(void *wdata);
 #else
 static inline int
 nfs_commit_inode(struct inode *inode, int how)
@@ -507,6 +505,7 @@ nfs_have_writebacks(struct inode *inode)
  * Allocate nfs_write_data structures
  */
 extern struct nfs_write_data *nfs_writedata_alloc(unsigned int npages);
+extern void nfs_writedata_free(struct nfs_write_data *);
 
 /*
  * linux/fs/nfs/read.c
@@ -515,7 +514,6 @@ extern int  nfs_readpage(struct file *, struct page *);
 extern int  nfs_readpages(struct file *, struct address_space *,
 		struct list_head *, unsigned);
 extern int  nfs_readpage_result(struct rpc_task *, struct nfs_read_data *);
-extern void nfs_readdata_release(void *data);
 extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
 			       struct page *);
 
@@ -523,6 +521,7 @@ extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
  * Allocate nfs_read_data structures
  */
 extern struct nfs_read_data *nfs_readdata_alloc(unsigned int npages);
+extern void nfs_readdata_free(struct nfs_read_data *);
 
 /*
  * linux/fs/nfs3proc.c
-- 
cgit v0.10.2


From 39cbb602b543e477df71dca84b5b2e36f8bd29fc Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle" <alan.brunelle@hp.com>
Date: Fri, 7 Aug 2009 12:01:08 -0400
Subject: Remove double removal of blktrace directory

commit fd51d251e4cdb21f68e9dbc4336514d64a105a79
Author: Stefan Raspl <raspl@linux.vnet.ibm.com>
Date:   Tue May 19 09:59:08 2009 +0200

    blktrace: remove debugfs entries on bad path

added in an explicit invocation of debugfs_remove for bt->dir, in
blk_remove_buf_file_callback we are also getting the directory removed. On
occasion I am seeing memory corruption that I have bisected down to
this commit. [The testing involves a (long) series of I/O benchmarks
with blktrace invoked around the actual runs.] I believe that this
committed patch is correct, but the problem actually lies in the code
in blk_remove_buf_file_callback.

With this patch I am able to consistently get complete runs whereas
previously I could not get a single run to complete.

The first part of the patch simply moves the debugfs_remove below the
relay_close: the relay_close call will remove files under bt->dir, and
so we should not remove the directory until all the files we created
have been removed. (Note: This is not sufficient to fix the problem -
the file system code has ref counts on the directoy, so our invocation
does not cause the directory to actually be removed. Nonetheless, we
should not rely upon that feature.)

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 1090b0a..7a34cb5 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -267,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt)
 {
 	debugfs_remove(bt->msg_file);
 	debugfs_remove(bt->dropped_file);
-	debugfs_remove(bt->dir);
 	relay_close(bt->rchan);
+	debugfs_remove(bt->dir);
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
 	kfree(bt);
@@ -378,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
 
 static int blk_remove_buf_file_callback(struct dentry *dentry)
 {
-	struct dentry *parent = dentry->d_parent;
 	debugfs_remove(dentry);
 
-	/*
-	* this will fail for all but the last file, but that is ok. what we
-	* care about is the top level buts->name directory going away, when
-	* the last trace file is gone. Then we don't have to rmdir() that
-	* manually on trace stop, so it nicely solves the issue with
-	* force killing of running traces.
-	*/
-
-	debugfs_remove(parent);
 	return 0;
 }
 
-- 
cgit v0.10.2


From 51d5668cb2e3fd1827a55184e48606fff054c5be Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 13 Aug 2009 09:54:02 +1000
Subject: md: never advance 'events' counter by more than 1.

When assembling arrays, md allows two devices to have different event
counts as long as the difference is only '1'.  This is to cope with
a system failure between updating the metadata on two difference
devices.

However there are currently times when we update the event count by
2.  This was done to keep the event count even when the array is clean
and odd when it is dirty, which allows us to avoid writing common
update to spare devices and so allow those spares to go to sleep.

This is bad for the above reason.  So change it to never increase by
two.  This means that the alignment between 'odd/even' and
'clean/dirty' might take a little longer to attain, but that is only a
small cost.  The spares will get a few more updates but that will
still be spared (;-) most updates and can still go to sleep.

Prior to this patch there was a small chance that after a crash an
array would fail to assemble due to the overly large event count
mismatch.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5614500..d18805f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1975,17 +1975,14 @@ repeat:
 		/* otherwise we have to go forward and ... */
 		mddev->events ++;
 		if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
-			/* .. if the array isn't clean, insist on an odd 'events' */
-			if ((mddev->events&1)==0) {
-				mddev->events++;
+			/* .. if the array isn't clean, an 'even' event must also go
+			 * to spares. */
+			if ((mddev->events&1)==0)
 				nospares = 0;
-			}
 		} else {
-			/* otherwise insist on an even 'events' (for clean states) */
-			if ((mddev->events&1)) {
-				mddev->events++;
+			/* otherwise an 'odd' event must go to spares */
+			if ((mddev->events&1))
 				nospares = 0;
-			}
 		}
 	}
 
-- 
cgit v0.10.2


From 67ac6011db5d2b0c853d573ff474b25c85dfb644 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 13 Aug 2009 10:06:24 +1000
Subject: md/raid5: allow new reshape modes to be restarted in the middle.

md/raid5 doesn't allow a reshape to restart if it involves writing
over the same part of disk that it would be reading from.
This happens at the beginning of a reshape that increases the number
of devices, at the end of a reshape that decreases the number of
devices, and continuously for a reshape that does not change the
number of devices.

The current code is correct for the "increase number of devices"
case as the critical section at the start is handled by userspace
performing a backup.

It does not work for reducing the number of devices, or the
no-change case.
For 'reducing', we need to invert the test.  For no-change we cannot
really be sure things will be safe, so simply require the array
to be read-only, which is how the user-space code which carefully
starts such arrays works.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2b521ee..b8a22a2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4509,7 +4509,26 @@ static int run(mddev_t *mddev)
 			   (old_disks-max_degraded));
 		/* here_old is the first stripe that we might need to read
 		 * from */
-		if (here_new >= here_old) {
+		if (mddev->delta_disks == 0) {
+			/* We cannot be sure it is safe to start an in-place
+			 * reshape.  It is only safe if user-space if monitoring
+			 * and taking constant backups.
+			 * mdadm always starts a situation like this in
+			 * readonly mode so it can take control before
+			 * allowing any writes.  So just check for that.
+			 */
+			if ((here_new * mddev->new_chunk_sectors != 
+			     here_old * mddev->chunk_sectors) ||
+			    mddev->ro == 0) {
+				printk(KERN_ERR "raid5: in-place reshape must be started"
+				       " in read-only mode - aborting\n");
+				return -EINVAL;
+			}
+		} else if (mddev->delta_disks < 0
+		    ? (here_new * mddev->new_chunk_sectors <=
+		       here_old * mddev->chunk_sectors)
+		    : (here_new * mddev->new_chunk_sectors >=
+		       here_old * mddev->chunk_sectors)) {
 			/* Reading from the same stripe as writing to - bad */
 			printk(KERN_ERR "raid5: reshape_position too early for "
 			       "auto-recovery - aborting.\n");
-- 
cgit v0.10.2


From a639755cf885e437b2fe4168d35157fa90d530ab Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 13 Aug 2009 10:13:00 +1000
Subject: md/raid5: make sure a reshape restarts at the correct address.

This "if" don't allow for the possibility that the number of devices
doesn't change, and so sector_nr isn't set correctly in that case.
So change '>' to '>='.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b8a22a2..94a74cb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3785,7 +3785,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
 			sector_nr = raid5_size(mddev, 0, 0)
 				- conf->reshape_progress;
-		} else if (mddev->delta_disks > 0 &&
+		} else if (mddev->delta_disks >= 0 &&
 			   conf->reshape_progress > 0)
 			sector_nr = conf->reshape_progress;
 		sector_div(sector_nr, new_data_disks);
-- 
cgit v0.10.2


From 1a67dde0abba36421a1257d01ba9de2f6d1c160a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 13 Aug 2009 10:41:49 +1000
Subject: md/raid5: Properly remove excess drives after shrinking a raid5/6

We were removing the drives, from the array, but not
removing symlinks from /sys/.... and not marking the device
as having been removed.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 94a74cb..b8a2c5d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5097,8 +5097,15 @@ static void raid5_finish_reshape(mddev_t *mddev)
 					mddev->degraded--;
 			for (d = conf->raid_disks ;
 			     d < conf->raid_disks - mddev->delta_disks;
-			     d++)
-				raid5_remove_disk(mddev, d);
+			     d++) {
+				mdk_rdev_t *rdev = conf->disks[d].rdev;
+				if (rdev && raid5_remove_disk(mddev, d) == 0) {
+					char nm[20];
+					sprintf(nm, "rd%d", rdev->raid_disk);
+					sysfs_remove_link(&mddev->kobj, nm);
+					rdev->raid_disk = -1;
+				}
+			}
 		}
 		mddev->layout = conf->algorithm;
 		mddev->chunk_sectors = conf->chunk_sectors;
-- 
cgit v0.10.2


From 4d484a4a7a5126410eed5f8dd329a33f6eeed068 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 13 Aug 2009 10:41:50 +1000
Subject: md: allow upper limit for resync/reshape to be set when array is
 read-only

Normally we only allow the upper limit for a reshape to be decreased
when the array not performing a sync/recovery/reshape, otherwise there
could be races.  But if an array is part-way through a reshape when it
is assembled the reshape is started immediately leaving no window
to set an upper bound.

If the array is started read-only, the reshape will be suspended until
the array becomes writable, so that provides a window during which it
is perfectly safe to reduce the upper limit of a reshape.

So: allow the upper limit (sync_max) to be reduced even if the reshape
thread is running, as long as the array is still read-only.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d18805f..103f2d3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3599,6 +3599,7 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
 		if (max < mddev->resync_min)
 			return -EINVAL;
 		if (max < mddev->resync_max &&
+		    mddev->ro == 0 &&
 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 			return -EBUSY;
 
-- 
cgit v0.10.2


From ba9a633787eed1e90d587282642580ad3d44f7fd Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 22 Jul 2009 15:14:29 +0000
Subject: sh: convert processor device setup functions to arch_initcall()

Convert the processor platform device setup
functions from __initcall() and sometimes
device_initcall() to arch_initcall().

This makes sure that the platform devices are
registered a bit earlier so the devices are
available when drivers register using initcall
levels earlier than device_initcall().

A good example is platform devices needed by
i2c-sh_mobile.c which registers a bit earlier
using subsys_initcall().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
index 1379873..8555c05 100644
--- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
@@ -187,7 +187,7 @@ static int __init sh7619_devices_setup(void)
 	return platform_add_devices(sh7619_devices,
 				    ARRAY_SIZE(sh7619_devices));
 }
-__initcall(sh7619_devices_setup);
+arch_initcall(sh7619_devices_setup);
 
 void __init plat_irq_setup(void)
 {
diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
index 869c2da..b673764 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
@@ -238,7 +238,7 @@ static int __init mxg_devices_setup(void)
 	return platform_add_devices(mxg_devices,
 				    ARRAY_SIZE(mxg_devices));
 }
-__initcall(mxg_devices_setup);
+arch_initcall(mxg_devices_setup);
 
 void __init plat_irq_setup(void)
 {
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
index d8febe1..fbde5b7 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
@@ -357,7 +357,7 @@ static int __init sh7201_devices_setup(void)
 	return platform_add_devices(sh7201_devices,
 				    ARRAY_SIZE(sh7201_devices));
 }
-__initcall(sh7201_devices_setup);
+arch_initcall(sh7201_devices_setup);
 
 void __init plat_irq_setup(void)
 {
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
index 62e3039..d3fd536 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
@@ -367,7 +367,7 @@ static int __init sh7203_devices_setup(void)
 	return platform_add_devices(sh7203_devices,
 				    ARRAY_SIZE(sh7203_devices));
 }
-__initcall(sh7203_devices_setup);
+arch_initcall(sh7203_devices_setup);
 
 void __init plat_irq_setup(void)
 {
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
index 3e6f3d7..a9ccc5e 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
@@ -338,7 +338,7 @@ static int __init sh7206_devices_setup(void)
 	return platform_add_devices(sh7206_devices,
 				    ARRAY_SIZE(sh7206_devices));
 }
-__initcall(sh7206_devices_setup);
+arch_initcall(sh7206_devices_setup);
 
 void __init plat_irq_setup(void)
 {
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7705.c b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
index 88f742f..c231059 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7705.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
@@ -222,7 +222,7 @@ static int __init sh7705_devices_setup(void)
 	return platform_add_devices(sh7705_devices,
 				    ARRAY_SIZE(sh7705_devices));
 }
-__initcall(sh7705_devices_setup);
+arch_initcall(sh7705_devices_setup);
 
 static struct platform_device *sh7705_early_devices[] __initdata = {
 	&tmu0_device,
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
index c563067..347ab35 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
@@ -250,7 +250,7 @@ static int __init sh770x_devices_setup(void)
 	return platform_add_devices(sh770x_devices,
 		ARRAY_SIZE(sh770x_devices));
 }
-__initcall(sh770x_devices_setup);
+arch_initcall(sh770x_devices_setup);
 
 static struct platform_device *sh770x_early_devices[] __initdata = {
 	&tmu0_device,
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7710.c b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
index efa76c8..717e90a 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7710.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
@@ -226,7 +226,7 @@ static int __init sh7710_devices_setup(void)
 	return platform_add_devices(sh7710_devices,
 				    ARRAY_SIZE(sh7710_devices));
 }
-__initcall(sh7710_devices_setup);
+arch_initcall(sh7710_devices_setup);
 
 static struct platform_device *sh7710_early_devices[] __initdata = {
 	&tmu0_device,
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
index 5b21077..74d8baa 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
@@ -388,7 +388,7 @@ static int __init sh7720_devices_setup(void)
 	return platform_add_devices(sh7720_devices,
 				    ARRAY_SIZE(sh7720_devices));
 }
-__initcall(sh7720_devices_setup);
+arch_initcall(sh7720_devices_setup);
 
 static struct platform_device *sh7720_early_devices[] __initdata = {
 	&cmt0_device,
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
index 6d088d1..de4827d 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
@@ -138,7 +138,7 @@ static int __init sh4202_devices_setup(void)
 	return platform_add_devices(sh4202_devices,
 				    ARRAY_SIZE(sh4202_devices));
 }
-__initcall(sh4202_devices_setup);
+arch_initcall(sh4202_devices_setup);
 
 static struct platform_device *sh4202_early_devices[] __initdata = {
 	&tmu0_device,
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
index 851672d..1b8b122 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
@@ -239,7 +239,7 @@ static int __init sh7750_devices_setup(void)
 	return platform_add_devices(sh7750_devices,
 				    ARRAY_SIZE(sh7750_devices));
 }
-__initcall(sh7750_devices_setup);
+arch_initcall(sh7750_devices_setup);
 
 static struct platform_device *sh7750_early_devices[] __initdata = {
 	&tmu0_device,
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
index 5b82251..7fbb7be 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
@@ -265,7 +265,7 @@ static int __init sh7760_devices_setup(void)
 	return platform_add_devices(sh7760_devices,
 				    ARRAY_SIZE(sh7760_devices));
 }
-__initcall(sh7760_devices_setup);
+arch_initcall(sh7760_devices_setup);
 
 static struct platform_device *sh7760_early_devices[] __initdata = {
 	&tmu0_device,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index 6307e08..ac4d567 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -325,7 +325,7 @@ static int __init sh7343_devices_setup(void)
 	return platform_add_devices(sh7343_devices,
 				    ARRAY_SIZE(sh7343_devices));
 }
-__initcall(sh7343_devices_setup);
+arch_initcall(sh7343_devices_setup);
 
 static struct platform_device *sh7343_early_devices[] __initdata = {
 	&cmt_device,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index c18f7d0..1a956b1 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -318,7 +318,7 @@ static int __init sh7366_devices_setup(void)
 	return platform_add_devices(sh7366_devices,
 				    ARRAY_SIZE(sh7366_devices));
 }
-__initcall(sh7366_devices_setup);
+arch_initcall(sh7366_devices_setup);
 
 static struct platform_device *sh7366_early_devices[] __initdata = {
 	&cmt_device,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index ea524a2..cda76eb 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -359,7 +359,7 @@ static int __init sh7722_devices_setup(void)
 	return platform_add_devices(sh7722_devices,
 				    ARRAY_SIZE(sh7722_devices));
 }
-__initcall(sh7722_devices_setup);
+arch_initcall(sh7722_devices_setup);
 
 static struct platform_device *sh7722_early_devices[] __initdata = {
 	&cmt_device,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index e1bb80b..b45dace 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -473,7 +473,7 @@ static int __init sh7723_devices_setup(void)
 	return platform_add_devices(sh7723_devices,
 				    ARRAY_SIZE(sh7723_devices));
 }
-__initcall(sh7723_devices_setup);
+arch_initcall(sh7723_devices_setup);
 
 static struct platform_device *sh7723_early_devices[] __initdata = {
 	&cmt_device,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index e5ac9eb..a04edaa 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -508,7 +508,7 @@ static int __init sh7724_devices_setup(void)
 	return platform_add_devices(sh7724_devices,
 				    ARRAY_SIZE(sh7724_devices));
 }
-device_initcall(sh7724_devices_setup);
+arch_initcall(sh7724_devices_setup);
 
 static struct platform_device *sh7724_early_devices[] __initdata = {
 	&cmt_device,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
index f1e0c0d..4659fff 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
@@ -314,7 +314,7 @@ static int __init sh7763_devices_setup(void)
 	return platform_add_devices(sh7763_devices,
 				    ARRAY_SIZE(sh7763_devices));
 }
-__initcall(sh7763_devices_setup);
+arch_initcall(sh7763_devices_setup);
 
 static struct platform_device *sh7763_early_devices[] __initdata = {
 	&tmu0_device,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
index 1e86209..eead08d 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
@@ -368,7 +368,7 @@ static int __init sh7770_devices_setup(void)
 	return platform_add_devices(sh7770_devices,
 				    ARRAY_SIZE(sh7770_devices));
 }
-__initcall(sh7770_devices_setup);
+arch_initcall(sh7770_devices_setup);
 
 static struct platform_device *sh7770_early_devices[] __initdata = {
 	&tmu0_device,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
index 715e05b..2c901f4 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
@@ -256,7 +256,7 @@ static int __init sh7780_devices_setup(void)
 	return platform_add_devices(sh7780_devices,
 				    ARRAY_SIZE(sh7780_devices));
 }
-__initcall(sh7780_devices_setup);
+arch_initcall(sh7780_devices_setup);
 
 static struct platform_device *sh7780_early_devices[] __initdata = {
 	&tmu0_device,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
index af56140..7f6c718 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
@@ -263,7 +263,7 @@ static int __init sh7785_devices_setup(void)
 	return platform_add_devices(sh7785_devices,
 				    ARRAY_SIZE(sh7785_devices));
 }
-__initcall(sh7785_devices_setup);
+arch_initcall(sh7785_devices_setup);
 
 static struct platform_device *sh7785_early_devices[] __initdata = {
 	&tmu0_device,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
index b700494..0104a8e 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
@@ -547,7 +547,7 @@ static int __init sh7786_devices_setup(void)
 	return platform_add_devices(sh7786_devices,
 				    ARRAY_SIZE(sh7786_devices));
 }
-device_initcall(sh7786_devices_setup);
+arch_initcall(sh7786_devices_setup);
 
 void __init plat_early_device_setup(void)
 {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
index 53c65fd..07f0789 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
@@ -256,7 +256,7 @@ static int __init shx3_devices_setup(void)
 	return platform_add_devices(shx3_devices,
 				    ARRAY_SIZE(shx3_devices));
 }
-__initcall(shx3_devices_setup);
+arch_initcall(shx3_devices_setup);
 
 void __init plat_early_device_setup(void)
 {
diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c
index f5ff1ac..6a0f82f 100644
--- a/arch/sh/kernel/cpu/sh5/setup-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c
@@ -186,7 +186,7 @@ static int __init sh5_devices_setup(void)
 	return platform_add_devices(sh5_devices,
 				    ARRAY_SIZE(sh5_devices));
 }
-__initcall(sh5_devices_setup);
+arch_initcall(sh5_devices_setup);
 
 void __init plat_early_device_setup(void)
 {
-- 
cgit v0.10.2


From ba3a17019181af5d6ab898760620c4e9916396c2 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 13 Aug 2009 11:39:02 +0900
Subject: sh: fix i2c init order on Migo-R V2

Convert the Migo-R board code to register devices at
arch_initcall() time instead of __initcall(). This fix
unbreaks migor_ts touch screen driver support.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c
index f70f464..f9b2e4d 100644
--- a/arch/sh/boards/mach-migor/setup.c
+++ b/arch/sh/boards/mach-migor/setup.c
@@ -608,7 +608,7 @@ static int __init migor_devices_setup(void)
 
 	return platform_add_devices(migor_devices, ARRAY_SIZE(migor_devices));
 }
-__initcall(migor_devices_setup);
+arch_initcall(migor_devices_setup);
 
 /* Return the board specific boot mode pin configuration */
 static int migor_mode_pins(void)
-- 
cgit v0.10.2


From dbefd606a3b3634799b625f4900336e61c89e868 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 7 Aug 2009 03:52:18 +0000
Subject: sh: fix i2c init order on ap325rxa V2

Convert the AP325RXA board code to register devices at
arch_initcall() time instead of device_initcall(). This
fix unbreaks pcf8563 RTC driver support.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/board-ap325rxa.c b/arch/sh/boards/board-ap325rxa.c
index 7ffd1b43..b9c88cc 100644
--- a/arch/sh/boards/board-ap325rxa.c
+++ b/arch/sh/boards/board-ap325rxa.c
@@ -547,7 +547,7 @@ static int __init ap325rxa_devices_setup(void)
 	return platform_add_devices(ap325rxa_devices,
 				ARRAY_SIZE(ap325rxa_devices));
 }
-device_initcall(ap325rxa_devices_setup);
+arch_initcall(ap325rxa_devices_setup);
 
 /* Return the board specific boot mode pin configuration */
 static int ap325rxa_mode_pins(void)
-- 
cgit v0.10.2


From 8f7a0dc51674ad0dd06155291b0aed60d655943c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Aug 2009 14:44:59 -0300
Subject: perf list: Fix large list output by using the pager

When /sys/kernel/debug is mounted the list can be imense, so
use the pager like the other tools.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090812174459.GB3495@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index f990fa8..d88c696 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -10,11 +10,12 @@
 
 #include "perf.h"
 
-#include "util/parse-options.h"
 #include "util/parse-events.h"
+#include "util/cache.h"
 
 int cmd_list(int argc __used, const char **argv __used, const char *prefix __used)
 {
+	setup_pager();
 	print_events();
 	return 0;
 }
-- 
cgit v0.10.2


From 28402971d869e26271b25301011f667d3a5640c3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 13 Aug 2009 10:13:22 +0200
Subject: perf_counter: Provide hw_perf_counter_setup_online() APIs

Provide weak aliases for hw_perf_counter_setup_online(). This is
used by the BTS patches (for v2.6.32), but it interacts with
fixes so propagate this upstream. (it has no effect as of yet)

Also export perf_counter_output() to architecture code.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a9d823a..2b36afe 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -694,6 +694,8 @@ struct perf_sample_data {
 
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
 				 struct perf_sample_data *data);
+extern void perf_counter_output(struct perf_counter *counter, int nmi,
+				struct perf_sample_data *data);
 
 /*
  * Return 1 for a software counter, 0 for a hardware counter
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b0b20a0..e26d2fc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -88,6 +88,7 @@ void __weak hw_perf_disable(void)		{ barrier(); }
 void __weak hw_perf_enable(void)		{ barrier(); }
 
 void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
+void __weak hw_perf_counter_setup_online(int cpu)	{ barrier(); }
 
 int __weak
 hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
 	return task_pid_nr_ns(p, counter->ns);
 }
 
-static void perf_counter_output(struct perf_counter *counter, int nmi,
+void perf_counter_output(struct perf_counter *counter, int nmi,
 				struct perf_sample_data *data)
 {
 	int ret;
@@ -4592,6 +4593,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 		perf_counter_init_cpu(cpu);
 		break;
 
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		hw_perf_counter_setup_online(cpu);
+		break;
+
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		perf_counter_exit_cpu(cpu);
@@ -4616,6 +4622,8 @@ void __init perf_counter_init(void)
 {
 	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
+	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+			(void *)(long)smp_processor_id());
 	register_cpu_notifier(&perf_cpu_nb);
 }
 
-- 
cgit v0.10.2


From 3a9f131fb00b8ac5950a11ad1599e45edfb5ae44 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 13 Aug 2009 10:27:18 +0200
Subject: perf tools: Add a per tracepoint counter attribute to get raw sample

Add a new flag field while opening a tracepoint perf counter:

	-e tracepoint_subsystem:tracepoint_name:flags

This is intended to be generic although for now it only supports the
r[e[c[o[r[d]]]]] flag:

	./perf record -e workqueue:workqueue_insertion:record
	./perf record -e workqueue:workqueue_insertion:r

will have the same effect: enabling the raw samples record for
the given tracepoint counter.

In the future, we may want to support further flags, separated
by commas.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1250152039-7284-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index de76008..78adc47 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -398,7 +398,7 @@ static void create_counter(int counter, int cpu, pid_t pid)
 				  PERF_FORMAT_TOTAL_TIME_RUNNING |
 				  PERF_FORMAT_ID;
 
-	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr->sample_type	|= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
 	if (freq) {
 		attr->sample_type	|= PERF_SAMPLE_PERIOD;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 4858d83..0441784 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -379,6 +379,7 @@ static int parse_tracepoint_event(const char **strp,
 				    struct perf_counter_attr *attr)
 {
 	const char *evt_name;
+	char *flags;
 	char sys_name[MAX_EVENT_LENGTH];
 	char id_buf[4];
 	int fd;
@@ -400,6 +401,15 @@ static int parse_tracepoint_event(const char **strp,
 	strncpy(sys_name, *strp, sys_length);
 	sys_name[sys_length] = '\0';
 	evt_name = evt_name + 1;
+
+	flags = strchr(evt_name, ':');
+	if (flags) {
+		*flags = '\0';
+		flags++;
+		if (!strncmp(flags, "record", strlen(flags)))
+			attr->sample_type |= PERF_SAMPLE_RAW;
+	}
+
 	evt_length = strlen(evt_name);
 	if (evt_length >= MAX_EVENT_LENGTH)
 		return 0;
-- 
cgit v0.10.2


From daac07b2e6b77f1bd44104aa2f0593e5505f27d4 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 13 Aug 2009 10:27:19 +0200
Subject: perf tools: Add a general option to enable raw sample records

While we can enable the perf sample records per tracepoint
counter, we may also want to enable this option for every
tracepoint counters to open, so that we don't need to add a
:record flag for all of them.

Add the -R, --raw-samples options for this purpose.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1250152039-7284-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 78adc47..3d051b9 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -34,6 +34,7 @@ static int			output;
 static const char		*output_name			= "perf.data";
 static int			group				= 0;
 static unsigned int		realtime_prio			= 0;
+static int			raw_samples			= 0;
 static int			system_wide			= 0;
 static int			profile_cpu			= -1;
 static pid_t			target_pid			= -1;
@@ -418,6 +419,8 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	if (call_graph)
 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
 
+	if (raw_samples)
+		attr->sample_type	|= PERF_SAMPLE_RAW;
 
 	attr->mmap		= track;
 	attr->comm		= track;
@@ -650,6 +653,8 @@ static const struct option options[] = {
 		    "record events on existing pid"),
 	OPT_INTEGER('r', "realtime", &realtime_prio,
 		    "collect data with this RT SCHED_FIFO priority"),
+	OPT_BOOLEAN('R', "raw-samples", &raw_samples,
+		    "collect raw sample records from all opened counters"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 			    "system-wide collection from all CPUs"),
 	OPT_BOOLEAN('A', "append", &append_file,
-- 
cgit v0.10.2


From 8fd101f20bdf771949a8f3a5a779877d09b2fb56 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Aug 2009 18:19:57 -0300
Subject: perf report: Don't show unresolved DSOs and symbols when -S/-d is
 used

We're interested in just those symbols/DSOs, so filter out the
unresolved ones.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090812211957.GE3495@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 23e1457..b53a60f 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1590,10 +1590,11 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 	if (show & show_mask) {
 		struct symbol *sym = resolve_symbol(thread, &map, &dso, &ip);
 
-		if (dso_list && dso && dso->name && !strlist__has_entry(dso_list, dso->name))
+		if (dso_list && (!dso || !dso->name ||
+				 !strlist__has_entry(dso_list, dso->name)))
 			return 0;
 
-		if (sym_list && sym && !strlist__has_entry(sym_list, sym->name))
+		if (sym_list && (!sym || !strlist__has_entry(sym_list, sym->name)))
 			return 0;
 
 		if (hist_entry__add(thread, map, dso, sym, ip, chain, level, period)) {
-- 
cgit v0.10.2


From bcfc2602e8541ac13b1def38e2591dca072cff7a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 13 Aug 2009 09:51:55 +0200
Subject: perf_counter: Fix swcounter context invariance

perf_swcounter_is_counting() uses a lock, which means we cannot
use swcounters from NMI or when holding that particular lock,
this is unintended.

The below removes the lock, this opens up race window, but not
worse than the swcounters already experience due to RCU
traversal of the context in perf_swcounter_ctx_event().

This also fixes the hard lockups while opening a lockdep
tracepoint counter.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Corey J Ashford <cjashfor@us.ibm.com>
LKML-Reference: <1250149915.10001.66.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e26d2fc..3dd4339 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3444,40 +3444,32 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
 {
-	struct perf_counter_context *ctx;
-	unsigned long flags;
-	int count;
-
+	/*
+	 * The counter is active, we're good!
+	 */
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		return 1;
 
+	/*
+	 * The counter is off/error, not counting.
+	 */
 	if (counter->state != PERF_COUNTER_STATE_INACTIVE)
 		return 0;
 
 	/*
-	 * If the counter is inactive, it could be just because
-	 * its task is scheduled out, or because it's in a group
-	 * which could not go on the PMU.  We want to count in
-	 * the first case but not the second.  If the context is
-	 * currently active then an inactive software counter must
-	 * be the second case.  If it's not currently active then
-	 * we need to know whether the counter was active when the
-	 * context was last active, which we can determine by
-	 * comparing counter->tstamp_stopped with ctx->time.
-	 *
-	 * We are within an RCU read-side critical section,
-	 * which protects the existence of *ctx.
+	 * The counter is inactive, if the context is active
+	 * we're part of a group that didn't make it on the 'pmu',
+	 * not counting.
 	 */
-	ctx = counter->ctx;
-	spin_lock_irqsave(&ctx->lock, flags);
-	count = 1;
-	/* Re-check state now we have the lock */
-	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
-	    counter->ctx->is_active ||
-	    counter->tstamp_stopped < ctx->time)
-		count = 0;
-	spin_unlock_irqrestore(&ctx->lock, flags);
-	return count;
+	if (counter->ctx->is_active)
+		return 0;
+
+	/*
+	 * We're inactive and the context is too, this means the
+	 * task is scheduled out, we're counting events that happen
+	 * to us, like migration events.
+	 */
+	return 1;
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,
-- 
cgit v0.10.2


From 3dab77fb1bf89664bb1c9544607159dcab6f7d57 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 13 Aug 2009 11:47:53 +0200
Subject: perf: Rework/fix the whole read vs group stuff

Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce
PERF_FORMAT_GROUP to deal with group reads in a more generic
way.

This allows you to get group reads out of read() as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey J Ashford <cjashfor@us.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
LKML-Reference: <20090813103655.117411814@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2b36afe..b53f700 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -115,7 +115,7 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_TID				= 1U << 1,
 	PERF_SAMPLE_TIME			= 1U << 2,
 	PERF_SAMPLE_ADDR			= 1U << 3,
-	PERF_SAMPLE_GROUP			= 1U << 4,
+	PERF_SAMPLE_READ			= 1U << 4,
 	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
 	PERF_SAMPLE_ID				= 1U << 6,
 	PERF_SAMPLE_CPU				= 1U << 7,
@@ -127,16 +127,32 @@ enum perf_counter_sample_format {
 };
 
 /*
- * Bits that can be set in attr.read_format to request that
- * reads on the counter should return the indicated quantities,
- * in increasing order of bit value, after the counter value.
+ * The format of the data returned by read() on a perf counter fd,
+ * as specified by attr.read_format:
+ *
+ * struct read_format {
+ * 	{ u64		value;
+ * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ * 	  { u64		id;           } && PERF_FORMAT_ID
+ * 	} && !PERF_FORMAT_GROUP
+ *
+ * 	{ u64		nr;
+ * 	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ * 	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ * 	  { u64		value;
+ * 	    { u64	id;           } && PERF_FORMAT_ID
+ * 	  }		cntr[nr];
+ * 	} && PERF_FORMAT_GROUP
+ * };
  */
 enum perf_counter_read_format {
 	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
 	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
 	PERF_FORMAT_ID				= 1U << 2,
+	PERF_FORMAT_GROUP			= 1U << 3,
 
-	PERF_FORMAT_MAX = 1U << 3, 		/* non-ABI */
+	PERF_FORMAT_MAX = 1U << 4, 		/* non-ABI */
 };
 
 #define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
@@ -343,10 +359,8 @@ enum perf_event_type {
 	 * struct {
 	 * 	struct perf_event_header	header;
 	 * 	u32				pid, tid;
-	 * 	u64				value;
-	 * 	{ u64		time_enabled; 	} && PERF_FORMAT_ENABLED
-	 * 	{ u64		time_running; 	} && PERF_FORMAT_RUNNING
-	 * 	{ u64		parent_id;	} && PERF_FORMAT_ID
+	 *
+	 * 	struct read_format		values;
 	 * };
 	 */
 	PERF_EVENT_READ			= 8,
@@ -364,11 +378,22 @@ enum perf_event_type {
 	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
 	 * 	{ u64			period;   } && PERF_SAMPLE_PERIOD
 	 *
-	 *	{ u64			nr;
-	 *	  { u64 id, val; }	cnt[nr];  } && PERF_SAMPLE_GROUP
+	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
 	 *
 	 *	{ u64			nr,
 	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
+	 *
+	 * 	#
+	 * 	# The RAW record below is opaque data wrt the ABI
+	 * 	#
+	 * 	# That is, the ABI doesn't make any promises wrt to
+	 * 	# the stability of its content, it may vary depending
+	 * 	# on event, hardware, kernel version and phase of
+	 * 	# the moon.
+	 * 	#
+	 * 	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
+	 * 	#
+	 *
 	 *	{ u32			size;
 	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
 	 * };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3dd4339..b8c6b97 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1692,7 +1692,32 @@ static int perf_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static u64 perf_counter_read_tree(struct perf_counter *counter)
+static int perf_counter_read_size(struct perf_counter *counter)
+{
+	int entry = sizeof(u64); /* value */
+	int size = 0;
+	int nr = 1;
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		size += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		size += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_ID)
+		entry += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_GROUP) {
+		nr += counter->group_leader->nr_siblings;
+		size += sizeof(u64);
+	}
+
+	size += entry * nr;
+
+	return size;
+}
+
+static u64 perf_counter_read_value(struct perf_counter *counter)
 {
 	struct perf_counter *child;
 	u64 total = 0;
@@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct perf_counter *counter)
 	return total;
 }
 
+static int perf_counter_read_entry(struct perf_counter *counter,
+				   u64 read_format, char __user *buf)
+{
+	int n = 0, count = 0;
+	u64 values[2];
+
+	values[n++] = perf_counter_read_value(counter);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	count = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, count))
+		return -EFAULT;
+
+	return count;
+}
+
+static int perf_counter_read_group(struct perf_counter *counter,
+				   u64 read_format, char __user *buf)
+{
+	struct perf_counter *leader = counter->group_leader, *sub;
+	int n = 0, size = 0, err = -EFAULT;
+	u64 values[3];
+
+	values[n++] = 1 + leader->nr_siblings;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = leader->total_time_enabled +
+			atomic64_read(&leader->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = leader->total_time_running +
+			atomic64_read(&leader->child_total_time_running);
+	}
+
+	size = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, size))
+		return -EFAULT;
+
+	err = perf_counter_read_entry(leader, read_format, buf + size);
+	if (err < 0)
+		return err;
+
+	size += err;
+
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		err = perf_counter_read_entry(counter, read_format,
+				buf + size);
+		if (err < 0)
+			return err;
+
+		size += err;
+	}
+
+	return size;
+}
+
+static int perf_counter_read_one(struct perf_counter *counter,
+				 u64 read_format, char __user *buf)
+{
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = perf_counter_read_value(counter);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	if (copy_to_user(buf, values, n * sizeof(u64)))
+		return -EFAULT;
+
+	return n * sizeof(u64);
+}
+
 /*
  * Read the performance counter - simple non blocking version for now
  */
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 values[4];
-	int n;
+	u64 read_format = counter->attr.read_format;
+	int ret;
 
 	/*
 	 * Return end-of-file for a read on a counter that is in
@@ -1721,28 +1828,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->state == PERF_COUNTER_STATE_ERROR)
 		return 0;
 
+	if (count < perf_counter_read_size(counter))
+		return -ENOSPC;
+
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
-	values[0] = perf_counter_read_tree(counter);
-	n = 1;
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		values[n++] = counter->total_time_enabled +
-			atomic64_read(&counter->child_total_time_enabled);
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		values[n++] = counter->total_time_running +
-			atomic64_read(&counter->child_total_time_running);
-	if (counter->attr.read_format & PERF_FORMAT_ID)
-		values[n++] = primary_counter_id(counter);
+	if (read_format & PERF_FORMAT_GROUP)
+		ret = perf_counter_read_group(counter, read_format, buf);
+	else
+		ret = perf_counter_read_one(counter, read_format, buf);
 	mutex_unlock(&counter->child_mutex);
 
-	if (count < n * sizeof(u64))
-		return -EINVAL;
-	count = n * sizeof(u64);
-
-	if (copy_to_user(buf, values, count))
-		return -EFAULT;
-
-	return count;
+	return ret;
 }
 
 static ssize_t
@@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
 	return task_pid_nr_ns(p, counter->ns);
 }
 
+static void perf_output_read_one(struct perf_output_handle *handle,
+				 struct perf_counter *counter)
+{
+	u64 read_format = counter->attr.read_format;
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = atomic64_read(&counter->count);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+			    struct perf_counter *counter)
+{
+	struct perf_counter *leader = counter->group_leader, *sub;
+	u64 read_format = counter->attr.read_format;
+	u64 values[5];
+	int n = 0;
+
+	values[n++] = 1 + leader->nr_siblings;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = leader->total_time_enabled;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = leader->total_time_running;
+
+	if (leader != counter)
+		leader->pmu->read(leader);
+
+	values[n++] = atomic64_read(&leader->count);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(leader);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		n = 0;
+
+		if (sub != counter)
+			sub->pmu->read(sub);
+
+		values[n++] = atomic64_read(&sub->count);
+		if (read_format & PERF_FORMAT_ID)
+			values[n++] = primary_counter_id(sub);
+
+		perf_output_copy(handle, values, n * sizeof(u64));
+	}
+}
+
+static void perf_output_read(struct perf_output_handle *handle,
+			     struct perf_counter *counter)
+{
+	if (counter->attr.read_format & PERF_FORMAT_GROUP)
+		perf_output_read_group(handle, counter);
+	else
+		perf_output_read_one(handle, counter);
+}
+
 void perf_counter_output(struct perf_counter *counter, int nmi,
 				struct perf_sample_data *data)
 {
@@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 	struct {
 		u32 pid, tid;
 	} tid_entry;
-	struct {
-		u64 id;
-		u64 counter;
-	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
 	u64 time;
@@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		header.size += sizeof(u64);
 
-	if (sample_type & PERF_SAMPLE_GROUP) {
-		header.size += sizeof(u64) +
-			counter->nr_siblings * sizeof(group_entry);
-	}
+	if (sample_type & PERF_SAMPLE_READ)
+		header.size += perf_counter_read_size(counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		callchain = perf_callchain(data->regs);
@@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		perf_output_put(&handle, data->period);
 
-	/*
-	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
-	 */
-	if (sample_type & PERF_SAMPLE_GROUP) {
-		struct perf_counter *leader, *sub;
-		u64 nr = counter->nr_siblings;
-
-		perf_output_put(&handle, nr);
-
-		leader = counter->group_leader;
-		list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-			if (sub != counter)
-				sub->pmu->read(sub);
-
-			group_entry.id = primary_counter_id(sub);
-			group_entry.counter = atomic64_read(&sub->count);
-
-			perf_output_put(&handle, group_entry);
-		}
-	}
+	if (sample_type & PERF_SAMPLE_READ)
+		perf_output_read(&handle, counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		if (callchain)
@@ -2818,8 +2964,6 @@ struct perf_read_event {
 
 	u32				pid;
 	u32				tid;
-	u64				value;
-	u64				format[3];
 };
 
 static void
@@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_counter *counter,
 		.header = {
 			.type = PERF_EVENT_READ,
 			.misc = 0,
-			.size = sizeof(event) - sizeof(event.format),
+			.size = sizeof(event) + perf_counter_read_size(counter),
 		},
 		.pid = perf_counter_pid(counter, task),
 		.tid = perf_counter_tid(counter, task),
-		.value = atomic64_read(&counter->count),
 	};
-	int ret, i = 0;
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = counter->total_time_enabled;
-	}
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = counter->total_time_running;
-	}
-
-	if (counter->attr.read_format & PERF_FORMAT_ID) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = primary_counter_id(counter);
-	}
+	int ret;
 
 	ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
 	if (ret)
 		return;
 
-	perf_output_copy(&handle, &event, event.header.size);
+	perf_output_put(&handle, event);
+	perf_output_read(&handle, counter);
+
 	perf_output_end(&handle);
 }
 
@@ -3921,9 +4051,9 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	atomic64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
-	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
+	 * we currently do not support PERF_FORMAT_GROUP on inherited counters
 	 */
-	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
+	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 		goto done;
 
 	switch (attr->type) {
-- 
cgit v0.10.2


From 970892a9031a5dc7217bd394fb9d89fa75a4a7bc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 13 Aug 2009 11:47:54 +0200
Subject: perf_counter: Fix an ipi-deadlock

perf_pending_counter() is called from IRQ context and will call
perf_counter_disable(), however perf_counter_disable() uses
smp_call_function_single() which doesn't fancy being used with
IRQs disabled due to IPI deadlocks.

Fix this by making it use the local __perf_counter_disable()
call and teaching the counter_sched_out() code about pending
disables as well.

This should cover the case where a counter migrates before the
pending queue gets processed.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey J Ashford <cjashfor@us.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
LKML-Reference: <20090813103655.244097721@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b8c6b97..3f841be 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -307,6 +307,10 @@ counter_sched_out(struct perf_counter *counter,
 		return;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	if (counter->pending_disable) {
+		counter->pending_disable = 0;
+		counter->state = PERF_COUNTER_STATE_OFF;
+	}
 	counter->tstamp_stopped = ctx->time;
 	counter->pmu->disable(counter);
 	counter->oncpu = -1;
@@ -2343,7 +2347,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)
 
 	if (counter->pending_disable) {
 		counter->pending_disable = 0;
-		perf_counter_disable(counter);
+		__perf_counter_disable(counter);
 	}
 
 	if (counter->pending_wakeup) {
-- 
cgit v0.10.2


From 94d5d1b2d891f1fd5205f978246b7864d998b25c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 13 Aug 2009 16:14:42 +0200
Subject: perf_counter: Report the cloning task as parent on
 perf_counter_fork()

A bug in (9f498cc: perf_counter: Full task tracing) makes
profiling multi-threaded apps it go belly up.

[ output as: (PID:TID):(PPID:PTID) ]

 # ./perf report -D | grep FORK
0x4b0 [0x18]: PERF_EVENT_FORK: (3237:3237):(3236:3236)
0xa10 [0x18]: PERF_EVENT_FORK: (3237:3238):(3236:3236)
0xa70 [0x18]: PERF_EVENT_FORK: (3237:3239):(3236:3236)
0xad0 [0x18]: PERF_EVENT_FORK: (3237:3240):(3236:3236)
0xb18 [0x18]: PERF_EVENT_FORK: (3237:3241):(3236:3236)

Shows us that the test (27d028d perf report: Update for the new
FORK/EXIT events) in builtin-report.c:

        /*
         * A thread clone will have the same PID for both
         * parent and child.
         */
        if (thread == parent)
                return 0;

Will clearly fail.

The problem is that perf_counter_fork() reports the actual
parent, instead of the cloning thread.

Fixing that (with the below patch), yields:

 # ./perf report -D | grep FORK
0x4c8 [0x18]: PERF_EVENT_FORK: (1590:1590):(1589:1589)
0xbd8 [0x18]: PERF_EVENT_FORK: (1590:1591):(1590:1590)
0xc80 [0x18]: PERF_EVENT_FORK: (1590:1592):(1590:1590)
0x3338 [0x18]: PERF_EVENT_FORK: (1590:1593):(1590:1590)
0x66b0 [0x18]: PERF_EVENT_FORK: (1590:1594):(1590:1590)

Which both makes more sense and doesn't confuse perf report
anymore.

Reported-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: paulus@samba.org
Cc: Anton Blanchard <anton@samba.org>
Cc: Arjan van de Ven <arjan@infradead.org>
LKML-Reference: <1250172882.5241.62.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3f841be..534e20d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3028,10 +3028,10 @@ static void perf_counter_task_output(struct perf_counter *counter,
 		return;
 
 	task_event->event.pid = perf_counter_pid(counter, task);
-	task_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+	task_event->event.ppid = perf_counter_pid(counter, current);
 
 	task_event->event.tid = perf_counter_tid(counter, task);
-	task_event->event.ptid = perf_counter_tid(counter, task->real_parent);
+	task_event->event.ptid = perf_counter_tid(counter, current);
 
 	perf_output_put(&handle, task_event->event);
 	perf_output_end(&handle);
-- 
cgit v0.10.2


From e694958388c50148389b0e9b9e9e8945cf0f1b98 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 13 Aug 2009 08:28:36 -0700
Subject: Make sock_sendpage() use kernel_sendpage()

kernel_sendpage() does the proper default case handling for when the
socket doesn't have a native sendpage implementation.

Now, arguably this might be something that we could instead solve by
just specifying that all protocols should do it themselves at the
protocol level, but we really only care about the common protocols.
Does anybody really care about sendpage on something like Appletalk? Not
likely.

Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Julien TINNES <julien@cr0.org>
Acked-by: Tavis Ormandy <taviso@sdf.lonestar.org>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/net/socket.c b/net/socket.c
index 791d71a..6d47165 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -736,7 +736,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
 	if (more)
 		flags |= MSG_MORE;
 
-	return sock->ops->sendpage(sock, page, offset, size, flags);
+	return kernel_sendpage(sock, page, offset, size, flags);
 }
 
 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
-- 
cgit v0.10.2


From 2d860ad76f4ee4d2eba0fe3797c8d7cdce432cc0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 13 Aug 2009 13:05:10 -0700
Subject: genirq: prevent wakeup of freed irq thread

free_irq() can remove an irqaction while the corresponding interrupt
is in progress, but free_irq() sets action->thread to NULL
unconditionally, which might lead to a NULL pointer dereference in
handle_IRQ_event() when the hard interrupt context tries to wake up
the handler thread.

Prevent this by moving the thread stop after synchronize_irq(). No
need to set action->thread to NULL either as action is going to be
freed anyway.

This fixes a boot crash reported against preempt-rt which uses the
mainline irq threads code to implement full irq threading.

[ tglx: removed local irqthread variable ]

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 61c679d..d222515 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -761,7 +761,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irqaction *action, **action_ptr;
-	struct task_struct *irqthread;
 	unsigned long flags;
 
 	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -809,9 +808,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 			desc->chip->disable(irq);
 	}
 
-	irqthread = action->thread;
-	action->thread = NULL;
-
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	unregister_handler_proc(irq, action);
@@ -819,12 +815,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	/* Make sure it's not being used on another CPU: */
 	synchronize_irq(irq);
 
-	if (irqthread) {
-		if (!test_bit(IRQTF_DIED, &action->thread_flags))
-			kthread_stop(irqthread);
-		put_task_struct(irqthread);
-	}
-
 #ifdef CONFIG_DEBUG_SHIRQ
 	/*
 	 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -840,6 +830,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 		local_irq_restore(flags);
 	}
 #endif
+
+	if (action->thread) {
+		if (!test_bit(IRQTF_DIED, &action->thread_flags))
+			kthread_stop(action->thread);
+		put_task_struct(action->thread);
+	}
+
 	return action;
 }
 
-- 
cgit v0.10.2


From 64f1607ffbbc772685733ea63e6f7f4183df1b16 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 13 Aug 2009 15:43:34 -0700
Subject: Linux 2.6.31-rc6


diff --git a/Makefile b/Makefile
index 0d46615..abcfa85 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 31
-EXTRAVERSION = -rc5
+EXTRAVERSION = -rc6
 NAME = Man-Eating Seals of Antiquity
 
 # *DOCUMENTATION*
-- 
cgit v0.10.2


From 9f162d2a810b4db48f7b8d7e734d0932c81ec2a1 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:18:44 +0300
Subject: sunrpc: hton -> cpu_to_be*

htonl is already defined as cpu_to_be32.
cpu_to_be64 has architecture specific optimized implementations.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index b99c625..f94bbdc 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -117,9 +117,8 @@ static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int le
 static inline __be32 *
 xdr_encode_hyper(__be32 *p, __u64 val)
 {
-	*p++ = htonl(val >> 32);
-	*p++ = htonl(val & 0xFFFFFFFF);
-	return p;
+	*(__be64 *)p = cpu_to_be64(val);
+	return p + 2;
 }
 
 static inline __be32 *
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 406e26d..0d05d25 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -24,7 +24,7 @@ xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj)
 	unsigned int	quadlen = XDR_QUADLEN(obj->len);
 
 	p[quadlen] = 0;		/* zero trailing bytes */
-	*p++ = htonl(obj->len);
+	*p++ = cpu_to_be32(obj->len);
 	memcpy(p, obj->data, obj->len);
 	return p + XDR_QUADLEN(obj->len);
 }
@@ -83,7 +83,7 @@ EXPORT_SYMBOL_GPL(xdr_encode_opaque_fixed);
  */
 __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int nbytes)
 {
-	*p++ = htonl(nbytes);
+	*p++ = cpu_to_be32(nbytes);
 	return xdr_encode_opaque_fixed(p, ptr, nbytes);
 }
 EXPORT_SYMBOL_GPL(xdr_encode_opaque);
@@ -779,7 +779,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_word);
 int
 xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
 {
-	__be32	raw = htonl(obj);
+	__be32	raw = cpu_to_be32(obj);
 
 	return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj));
 }
-- 
cgit v0.10.2


From 98866b5abe1513cdacc011874ca045d40002eccd Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:18:49 +0300
Subject: sunrpc: ntoh -> be*_to_cpu

ntohl is already defined as be32_to_cpu.
be64_to_cpu has architecture specific optimized implementations.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index f94bbdc..7da466b 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -124,9 +124,8 @@ xdr_encode_hyper(__be32 *p, __u64 val)
 static inline __be32 *
 xdr_decode_hyper(__be32 *p, __u64 *valp)
 {
-	*valp  = ((__u64) ntohl(*p++)) << 32;
-	*valp |= ntohl(*p++);
-	return p;
+	*valp = be64_to_cpup((__be64 *)p);
+	return p + 2;
 }
 
 /*
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 0d05d25..8bd690c 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -35,7 +35,7 @@ xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj)
 {
 	unsigned int	len;
 
-	if ((len = ntohl(*p++)) > XDR_MAX_NETOBJ)
+	if ((len = be32_to_cpu(*p++)) > XDR_MAX_NETOBJ)
 		return NULL;
 	obj->len  = len;
 	obj->data = (u8 *) p;
@@ -101,7 +101,7 @@ xdr_decode_string_inplace(__be32 *p, char **sp,
 {
 	u32 len;
 
-	len = ntohl(*p++);
+	len = be32_to_cpu(*p++);
 	if (len > maxlen)
 		return NULL;
 	*lenp = len;
@@ -771,7 +771,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
 	status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj));
 	if (status)
 		return status;
-	*obj = ntohl(raw);
+	*obj = be32_to_cpu(raw);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xdr_decode_word);
-- 
cgit v0.10.2


From e75bc1c89e0c7dda0b140408ddee2ffaef7ba6d4 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:18:54 +0300
Subject: nfs: nfs4xdr: get rid of WRITE32

s/WRITE32/*p++ = cpu_to_be32/

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 617273e..9a03b24 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -712,7 +712,6 @@ struct compound_hdr {
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define WRITE32(n)               *p++ = htonl(n)
 #define WRITE64(n)               do {				\
 	*p++ = htonl((uint32_t)((n) >> 32));				\
 	*p++ = htonl((uint32_t)(n));					\
@@ -750,11 +749,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
 	RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
-	WRITE32(hdr->taglen);
+	*p++ = cpu_to_be32(hdr->taglen);
 	WRITEMEM(hdr->tag, hdr->taglen);
-	WRITE32(hdr->minorversion);
+	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
-	WRITE32(hdr->nops);
+	*p++ = cpu_to_be32(hdr->nops);
 }
 
 static void encode_nops(struct compound_hdr *hdr)
@@ -835,7 +834,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	 * We write the bitmap length now, but leave the bitmap and the attribute
 	 * buffer length to be backfilled at the end of this routine.
 	 */
-	WRITE32(2);
+	*p++ = cpu_to_be32(2);
 	q = p;
 	p += 3;
 
@@ -845,39 +844,39 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	}
 	if (iap->ia_valid & ATTR_MODE) {
 		bmval1 |= FATTR4_WORD1_MODE;
-		WRITE32(iap->ia_mode & S_IALLUGO);
+		*p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
 	}
 	if (iap->ia_valid & ATTR_UID) {
 		bmval1 |= FATTR4_WORD1_OWNER;
-		WRITE32(owner_namelen);
+		*p++ = cpu_to_be32(owner_namelen);
 		WRITEMEM(owner_name, owner_namelen);
 	}
 	if (iap->ia_valid & ATTR_GID) {
 		bmval1 |= FATTR4_WORD1_OWNER_GROUP;
-		WRITE32(owner_grouplen);
+		*p++ = cpu_to_be32(owner_grouplen);
 		WRITEMEM(owner_group, owner_grouplen);
 	}
 	if (iap->ia_valid & ATTR_ATIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
-		WRITE32(NFS4_SET_TO_CLIENT_TIME);
-		WRITE32(0);
-		WRITE32(iap->ia_mtime.tv_sec);
-		WRITE32(iap->ia_mtime.tv_nsec);
+		*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
 	}
 	else if (iap->ia_valid & ATTR_ATIME) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
-		WRITE32(NFS4_SET_TO_SERVER_TIME);
+		*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
 	}
 	if (iap->ia_valid & ATTR_MTIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
-		WRITE32(NFS4_SET_TO_CLIENT_TIME);
-		WRITE32(0);
-		WRITE32(iap->ia_mtime.tv_sec);
-		WRITE32(iap->ia_mtime.tv_nsec);
+		*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
 	}
 	else if (iap->ia_valid & ATTR_MTIME) {
 		bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
-		WRITE32(NFS4_SET_TO_SERVER_TIME);
+		*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
 	}
 
 	/*
@@ -901,8 +900,8 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
 	__be32 *p;
 
 	RESERVE_SPACE(8);
-	WRITE32(OP_ACCESS);
-	WRITE32(access);
+	*p++ = cpu_to_be32(OP_ACCESS);
+	*p++ = cpu_to_be32(access);
 	hdr->nops++;
 	hdr->replen += decode_access_maxsz;
 }
@@ -912,8 +911,8 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 	__be32 *p;
 
 	RESERVE_SPACE(8+NFS4_STATEID_SIZE);
-	WRITE32(OP_CLOSE);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(OP_CLOSE);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_close_maxsz;
@@ -924,9 +923,9 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 	__be32 *p;
 
 	RESERVE_SPACE(16);
-	WRITE32(OP_COMMIT);
+	*p++ = cpu_to_be32(OP_COMMIT);
 	WRITE64(args->offset);
-	WRITE32(args->count);
+	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_commit_maxsz;
 }
@@ -936,20 +935,20 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	__be32 *p;
 
 	RESERVE_SPACE(8);
-	WRITE32(OP_CREATE);
-	WRITE32(create->ftype);
+	*p++ = cpu_to_be32(OP_CREATE);
+	*p++ = cpu_to_be32(create->ftype);
 
 	switch (create->ftype) {
 	case NF4LNK:
 		RESERVE_SPACE(4);
-		WRITE32(create->u.symlink.len);
+		*p++ = cpu_to_be32(create->u.symlink.len);
 		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
 		break;
 
 	case NF4BLK: case NF4CHR:
 		RESERVE_SPACE(8);
-		WRITE32(create->u.device.specdata1);
-		WRITE32(create->u.device.specdata2);
+		*p++ = cpu_to_be32(create->u.device.specdata1);
+		*p++ = cpu_to_be32(create->u.device.specdata2);
 		break;
 
 	default:
@@ -957,7 +956,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	}
 
 	RESERVE_SPACE(4 + create->name->len);
-	WRITE32(create->name->len);
+	*p++ = cpu_to_be32(create->name->len);
 	WRITEMEM(create->name->name, create->name->len);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
@@ -970,9 +969,9 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
 	__be32 *p;
 
 	RESERVE_SPACE(12);
-	WRITE32(OP_GETATTR);
-	WRITE32(1);
-	WRITE32(bitmap);
+	*p++ = cpu_to_be32(OP_GETATTR);
+	*p++ = cpu_to_be32(1);
+	*p++ = cpu_to_be32(bitmap);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -982,10 +981,10 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
 	__be32 *p;
 
 	RESERVE_SPACE(16);
-	WRITE32(OP_GETATTR);
-	WRITE32(2);
-	WRITE32(bm0);
-	WRITE32(bm1);
+	*p++ = cpu_to_be32(OP_GETATTR);
+	*p++ = cpu_to_be32(2);
+	*p++ = cpu_to_be32(bm0);
+	*p++ = cpu_to_be32(bm1);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -1013,7 +1012,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_GETFH);
+	*p++ = cpu_to_be32(OP_GETFH);
 	hdr->nops++;
 	hdr->replen += decode_getfh_maxsz;
 }
@@ -1023,8 +1022,8 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 	__be32 *p;
 
 	RESERVE_SPACE(8 + name->len);
-	WRITE32(OP_LINK);
-	WRITE32(name->len);
+	*p++ = cpu_to_be32(OP_LINK);
+	*p++ = cpu_to_be32(name->len);
 	WRITEMEM(name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
@@ -1053,26 +1052,26 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	__be32 *p;
 
 	RESERVE_SPACE(32);
-	WRITE32(OP_LOCK);
-	WRITE32(nfs4_lock_type(args->fl, args->block));
-	WRITE32(args->reclaim);
+	*p++ = cpu_to_be32(OP_LOCK);
+	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
+	*p++ = cpu_to_be32(args->reclaim);
 	WRITE64(args->fl->fl_start);
 	WRITE64(nfs4_lock_length(args->fl));
-	WRITE32(args->new_lock_owner);
+	*p++ = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
 		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
-		WRITE32(args->open_seqid->sequence->counter);
+		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
 		WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
-		WRITE32(args->lock_seqid->sequence->counter);
+		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 		WRITE64(args->lock_owner.clientid);
-		WRITE32(16);
+		*p++ = cpu_to_be32(16);
 		WRITEMEM("lock id:", 8);
 		WRITE64(args->lock_owner.id);
 	}
 	else {
 		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
 		WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
-		WRITE32(args->lock_seqid->sequence->counter);
+		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
 	hdr->nops++;
 	hdr->replen += decode_lock_maxsz;
@@ -1083,12 +1082,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	__be32 *p;
 
 	RESERVE_SPACE(52);
-	WRITE32(OP_LOCKT);
-	WRITE32(nfs4_lock_type(args->fl, 0));
+	*p++ = cpu_to_be32(OP_LOCKT);
+	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	WRITE64(args->fl->fl_start);
 	WRITE64(nfs4_lock_length(args->fl));
 	WRITE64(args->lock_owner.clientid);
-	WRITE32(16);
+	*p++ = cpu_to_be32(16);
 	WRITEMEM("lock id:", 8);
 	WRITE64(args->lock_owner.id);
 	hdr->nops++;
@@ -1100,9 +1099,9 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	__be32 *p;
 
 	RESERVE_SPACE(12+NFS4_STATEID_SIZE+16);
-	WRITE32(OP_LOCKU);
-	WRITE32(nfs4_lock_type(args->fl, 0));
-	WRITE32(args->seqid->sequence->counter);
+	*p++ = cpu_to_be32(OP_LOCKU);
+	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+	*p++ = cpu_to_be32(args->seqid->sequence->counter);
 	WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
 	WRITE64(args->fl->fl_start);
 	WRITE64(nfs4_lock_length(args->fl));
@@ -1116,8 +1115,8 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	__be32 *p;
 
 	RESERVE_SPACE(8 + len);
-	WRITE32(OP_LOOKUP);
-	WRITE32(len);
+	*p++ = cpu_to_be32(OP_LOOKUP);
+	*p++ = cpu_to_be32(len);
 	WRITEMEM(name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
@@ -1130,18 +1129,18 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
 	RESERVE_SPACE(8);
 	switch (fmode & (FMODE_READ|FMODE_WRITE)) {
 	case FMODE_READ:
-		WRITE32(NFS4_SHARE_ACCESS_READ);
+		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
 		break;
 	case FMODE_WRITE:
-		WRITE32(NFS4_SHARE_ACCESS_WRITE);
+		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
 		break;
 	case FMODE_READ|FMODE_WRITE:
-		WRITE32(NFS4_SHARE_ACCESS_BOTH);
+		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
 		break;
 	default:
-		WRITE32(0);
+		*p++ = cpu_to_be32(0);
 	}
-	WRITE32(0);		/* for linux, share_deny = 0 always */
+	*p++ = cpu_to_be32(0);		/* for linux, share_deny = 0 always */
 }
 
 static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1152,12 +1151,12 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
  * owner 4 = 32
  */
 	RESERVE_SPACE(8);
-	WRITE32(OP_OPEN);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(OP_OPEN);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	RESERVE_SPACE(28);
 	WRITE64(arg->clientid);
-	WRITE32(16);
+	*p++ = cpu_to_be32(16);
 	WRITEMEM("open id:", 8);
 	WRITE64(arg->id);
 }
@@ -1169,11 +1168,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 	RESERVE_SPACE(4);
 	switch(arg->open_flags & O_EXCL) {
 	case 0:
-		WRITE32(NFS4_CREATE_UNCHECKED);
+		*p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED);
 		encode_attrs(xdr, arg->u.attrs, arg->server);
 		break;
 	default:
-		WRITE32(NFS4_CREATE_EXCLUSIVE);
+		*p++ = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
 		encode_nfs4_verifier(xdr, &arg->u.verifier);
 	}
 }
@@ -1185,11 +1184,11 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
 	RESERVE_SPACE(4);
 	switch (arg->open_flags & O_CREAT) {
 	case 0:
-		WRITE32(NFS4_OPEN_NOCREATE);
+		*p++ = cpu_to_be32(NFS4_OPEN_NOCREATE);
 		break;
 	default:
 		BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
-		WRITE32(NFS4_OPEN_CREATE);
+		*p++ = cpu_to_be32(NFS4_OPEN_CREATE);
 		encode_createmode(xdr, arg);
 	}
 }
@@ -1201,13 +1200,13 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
 	RESERVE_SPACE(4);
 	switch (delegation_type) {
 	case 0:
-		WRITE32(NFS4_OPEN_DELEGATE_NONE);
+		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
 		break;
 	case FMODE_READ:
-		WRITE32(NFS4_OPEN_DELEGATE_READ);
+		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
 		break;
 	case FMODE_WRITE|FMODE_READ:
-		WRITE32(NFS4_OPEN_DELEGATE_WRITE);
+		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
 		break;
 	default:
 		BUG();
@@ -1219,7 +1218,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(NFS4_OPEN_CLAIM_NULL);
+	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1228,7 +1227,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
+	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
 	encode_delegation_type(xdr, type);
 }
 
@@ -1237,7 +1236,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
-	WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
+	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
 	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
 }
@@ -1268,9 +1267,9 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
-	WRITE32(OP_OPEN_CONFIRM);
+	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
 	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	hdr->nops++;
 	hdr->replen += decode_open_confirm_maxsz;
 }
@@ -1280,9 +1279,9 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
-	WRITE32(OP_OPEN_DOWNGRADE);
+	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
 	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	hdr->nops++;
 	hdr->replen += decode_open_downgrade_maxsz;
@@ -1295,8 +1294,8 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	__be32 *p;
 
 	RESERVE_SPACE(8 + len);
-	WRITE32(OP_PUTFH);
-	WRITE32(len);
+	*p++ = cpu_to_be32(OP_PUTFH);
+	*p++ = cpu_to_be32(len);
 	WRITEMEM(fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
@@ -1307,7 +1306,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_PUTROOTFH);
+	*p++ = cpu_to_be32(OP_PUTROOTFH);
 	hdr->nops++;
 	hdr->replen += decode_putrootfh_maxsz;
 }
@@ -1330,13 +1329,13 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_READ);
+	*p++ = cpu_to_be32(OP_READ);
 
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(12);
 	WRITE64(args->offset);
-	WRITE32(args->count);
+	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_read_maxsz;
 }
@@ -1350,19 +1349,19 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	__be32 *p;
 
 	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
-	WRITE32(OP_READDIR);
+	*p++ = cpu_to_be32(OP_READDIR);
 	WRITE64(readdir->cookie);
 	WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
-	WRITE32(readdir->count >> 1);  /* We're not doing readdirplus */
-	WRITE32(readdir->count);
-	WRITE32(2);
+	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
+	*p++ = cpu_to_be32(readdir->count);
+	*p++ = cpu_to_be32(2);
 	/* Switch to mounted_on_fileid if the server supports it */
 	if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
 		attrs[0] &= ~FATTR4_WORD0_FILEID;
 	else
 		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
-	WRITE32(attrs[0] & readdir->bitmask[0]);
-	WRITE32(attrs[1] & readdir->bitmask[1]);
+	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
+	*p++ = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
 	hdr->nops++;
 	hdr->replen += decode_readdir_maxsz;
 	dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
@@ -1379,7 +1378,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_READLINK);
+	*p++ = cpu_to_be32(OP_READLINK);
 	hdr->nops++;
 	hdr->replen += decode_readlink_maxsz;
 }
@@ -1389,8 +1388,8 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 	__be32 *p;
 
 	RESERVE_SPACE(8 + name->len);
-	WRITE32(OP_REMOVE);
-	WRITE32(name->len);
+	*p++ = cpu_to_be32(OP_REMOVE);
+	*p++ = cpu_to_be32(name->len);
 	WRITEMEM(name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
@@ -1401,12 +1400,12 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 	__be32 *p;
 
 	RESERVE_SPACE(8 + oldname->len);
-	WRITE32(OP_RENAME);
-	WRITE32(oldname->len);
+	*p++ = cpu_to_be32(OP_RENAME);
+	*p++ = cpu_to_be32(oldname->len);
 	WRITEMEM(oldname->name, oldname->len);
 
 	RESERVE_SPACE(4 + newname->len);
-	WRITE32(newname->len);
+	*p++ = cpu_to_be32(newname->len);
 	WRITEMEM(newname->name, newname->len);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
@@ -1417,7 +1416,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 	__be32 *p;
 
 	RESERVE_SPACE(12);
-	WRITE32(OP_RENEW);
+	*p++ = cpu_to_be32(OP_RENEW);
 	WRITE64(client_stateid->cl_clientid);
 	hdr->nops++;
 	hdr->replen += decode_renew_maxsz;
@@ -1429,7 +1428,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_RESTOREFH);
+	*p++ = cpu_to_be32(OP_RESTOREFH);
 	hdr->nops++;
 	hdr->replen += decode_restorefh_maxsz;
 }
@@ -1440,15 +1439,15 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
-	WRITE32(OP_SETATTR);
+	*p++ = cpu_to_be32(OP_SETATTR);
 	WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
 	RESERVE_SPACE(2*4);
-	WRITE32(1);
-	WRITE32(FATTR4_WORD0_ACL);
+	*p++ = cpu_to_be32(1);
+	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
 	if (arg->acl_len % 4)
 		return -EINVAL;
 	RESERVE_SPACE(4);
-	WRITE32(arg->acl_len);
+	*p++ = cpu_to_be32(arg->acl_len);
 	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
 	hdr->nops++;
 	hdr->replen += decode_setacl_maxsz;
@@ -1461,7 +1460,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_SAVEFH);
+	*p++ = cpu_to_be32(OP_SAVEFH);
 	hdr->nops++;
 	hdr->replen += decode_savefh_maxsz;
 }
@@ -1471,7 +1470,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
-	WRITE32(OP_SETATTR);
+	*p++ = cpu_to_be32(OP_SETATTR);
 	WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setattr_maxsz;
@@ -1483,16 +1482,16 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 	__be32 *p;
 
 	RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
-	WRITE32(OP_SETCLIENTID);
+	*p++ = cpu_to_be32(OP_SETCLIENTID);
 	WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
 	RESERVE_SPACE(4);
-	WRITE32(setclientid->sc_prog);
+	*p++ = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
 	RESERVE_SPACE(4);
-	WRITE32(setclientid->sc_cb_ident);
+	*p++ = cpu_to_be32(setclientid->sc_cb_ident);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_maxsz;
 }
@@ -1502,7 +1501,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 	__be32 *p;
 
 	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
-	WRITE32(OP_SETCLIENTID_CONFIRM);
+	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	WRITE64(client_state->cl_clientid);
 	WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
@@ -1514,14 +1513,14 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_WRITE);
+	*p++ = cpu_to_be32(OP_WRITE);
 
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(16);
 	WRITE64(args->offset);
-	WRITE32(args->stable);
-	WRITE32(args->count);
+	*p++ = cpu_to_be32(args->stable);
+	*p++ = cpu_to_be32(args->count);
 
 	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
 	hdr->nops++;
@@ -1534,7 +1533,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 
-	WRITE32(OP_DELEGRETURN);
+	*p++ = cpu_to_be32(OP_DELEGRETURN);
 	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_delegreturn_maxsz;
@@ -1549,15 +1548,15 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 	__be32 *p;
 
 	RESERVE_SPACE(4 + sizeof(args->verifier->data));
-	WRITE32(OP_EXCHANGE_ID);
+	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
 	WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
 	RESERVE_SPACE(12);
-	WRITE32(args->flags);
-	WRITE32(0);	/* zero length state_protect4_a */
-	WRITE32(0);	/* zero length implementation id array */
+	*p++ = cpu_to_be32(args->flags);
+	*p++ = cpu_to_be32(0);	/* zero length state_protect4_a */
+	*p++ = cpu_to_be32(0);	/* zero length implementation id array */
 	hdr->nops++;
 	hdr->replen += decode_exchange_id_maxsz;
 }
@@ -1572,54 +1571,54 @@ static void encode_create_session(struct xdr_stream *xdr,
 	struct nfs_client *clp = args->client;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_CREATE_SESSION);
+	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 
 	RESERVE_SPACE(8);
 	WRITE64(clp->cl_ex_clid);
 
 	RESERVE_SPACE(8);
-	WRITE32(clp->cl_seqid);			/*Sequence id */
-	WRITE32(args->flags);			/*flags */
+	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
+	*p++ = cpu_to_be32(args->flags);			/*flags */
 
 	RESERVE_SPACE(2*28);			/* 2 channel_attrs */
 	/* Fore Channel */
-	WRITE32(args->fc_attrs.headerpadsz);	/* header padding size */
-	WRITE32(args->fc_attrs.max_rqst_sz);	/* max req size */
-	WRITE32(args->fc_attrs.max_resp_sz);	/* max resp size */
-	WRITE32(args->fc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
-	WRITE32(args->fc_attrs.max_ops);	/* max operations */
-	WRITE32(args->fc_attrs.max_reqs);	/* max requests */
-	WRITE32(0);				/* rdmachannel_attrs */
+	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
+	*p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz);	/* max req size */
+	*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz);	/* max resp size */
+	*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
+	*p++ = cpu_to_be32(args->fc_attrs.max_ops);	/* max operations */
+	*p++ = cpu_to_be32(args->fc_attrs.max_reqs);	/* max requests */
+	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
 
 	/* Back Channel */
-	WRITE32(args->fc_attrs.headerpadsz);	/* header padding size */
-	WRITE32(args->bc_attrs.max_rqst_sz);	/* max req size */
-	WRITE32(args->bc_attrs.max_resp_sz);	/* max resp size */
-	WRITE32(args->bc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
-	WRITE32(args->bc_attrs.max_ops);	/* max operations */
-	WRITE32(args->bc_attrs.max_reqs);	/* max requests */
-	WRITE32(0);				/* rdmachannel_attrs */
+	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz);	/* max req size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz);	/* max resp size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
+	*p++ = cpu_to_be32(args->bc_attrs.max_ops);	/* max operations */
+	*p++ = cpu_to_be32(args->bc_attrs.max_reqs);	/* max requests */
+	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
 
 	RESERVE_SPACE(4);
-	WRITE32(args->cb_program);		/* cb_program */
+	*p++ = cpu_to_be32(args->cb_program);		/* cb_program */
 
 	RESERVE_SPACE(4);			/* # of security flavors */
-	WRITE32(1);
+	*p++ = cpu_to_be32(1);
 
 	RESERVE_SPACE(4);
-	WRITE32(RPC_AUTH_UNIX);			/* auth_sys */
+	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */
 
 	/* authsys_parms rfc1831 */
 	RESERVE_SPACE(4);
-	WRITE32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
+	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
 	len = scnprintf(machine_name, sizeof(machine_name), "%s",
 			clp->cl_ipaddr);
 	RESERVE_SPACE(16 + len);
-	WRITE32(len);
+	*p++ = cpu_to_be32(len);
 	WRITEMEM(machine_name, len);
-	WRITE32(0);				/* UID */
-	WRITE32(0);				/* GID */
-	WRITE32(0);				/* No more gids */
+	*p++ = cpu_to_be32(0);				/* UID */
+	*p++ = cpu_to_be32(0);				/* GID */
+	*p++ = cpu_to_be32(0);				/* No more gids */
 	hdr->nops++;
 	hdr->replen += decode_create_session_maxsz;
 }
@@ -1630,7 +1629,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 {
 	__be32 *p;
 	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
-	WRITE32(OP_DESTROY_SESSION);
+	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
 	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
 	hdr->replen += decode_destroy_session_maxsz;
@@ -1656,7 +1655,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	slot = tp->slots + args->sa_slotid;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_SEQUENCE);
+	*p++ = cpu_to_be32(OP_SEQUENCE);
 
 	/*
 	 * Sessionid + seqid + slotid + max slotid + cache_this
@@ -1672,10 +1671,10 @@ static void encode_sequence(struct xdr_stream *xdr,
 		tp->highest_used_slotid, args->sa_cache_this);
 	RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
 	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
-	WRITE32(slot->seq_nr);
-	WRITE32(args->sa_slotid);
-	WRITE32(tp->highest_used_slotid);
-	WRITE32(args->sa_cache_this);
+	*p++ = cpu_to_be32(slot->seq_nr);
+	*p++ = cpu_to_be32(args->sa_slotid);
+	*p++ = cpu_to_be32(tp->highest_used_slotid);
+	*p++ = cpu_to_be32(args->sa_cache_this);
 	hdr->nops++;
 	hdr->replen += decode_sequence_maxsz;
 #endif /* CONFIG_NFS_V4_1 */
-- 
cgit v0.10.2


From b95be5a976848febff82edb21d5b4351b3997bf6 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:01 +0300
Subject: nfs: nfs4xdr: get rid of WRITE64

s/WRITE64/p = xdr_encode_hyper(p, /

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 9a03b24..5d80766 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -712,10 +712,6 @@ struct compound_hdr {
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define WRITE64(n)               do {				\
-	*p++ = htonl((uint32_t)((n) >> 32));				\
-	*p++ = htonl((uint32_t)(n));					\
-} while (0)
 #define WRITEMEM(ptr,nbytes)     do {				\
 	p = xdr_encode_opaque_fixed(p, ptr, nbytes);		\
 } while (0)
@@ -840,7 +836,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 
 	if (iap->ia_valid & ATTR_SIZE) {
 		bmval0 |= FATTR4_WORD0_SIZE;
-		WRITE64(iap->ia_size);
+		p = xdr_encode_hyper(p, iap->ia_size);
 	}
 	if (iap->ia_valid & ATTR_MODE) {
 		bmval1 |= FATTR4_WORD1_MODE;
@@ -924,7 +920,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 
 	RESERVE_SPACE(16);
 	*p++ = cpu_to_be32(OP_COMMIT);
-	WRITE64(args->offset);
+	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_commit_maxsz;
@@ -1055,18 +1051,18 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	*p++ = cpu_to_be32(OP_LOCK);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
 	*p++ = cpu_to_be32(args->reclaim);
-	WRITE64(args->fl->fl_start);
-	WRITE64(nfs4_lock_length(args->fl));
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	*p++ = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
 		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
 		WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
-		WRITE64(args->lock_owner.clientid);
+		p = xdr_encode_hyper(p, args->lock_owner.clientid);
 		*p++ = cpu_to_be32(16);
 		WRITEMEM("lock id:", 8);
-		WRITE64(args->lock_owner.id);
+		p = xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
 		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
@@ -1084,12 +1080,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	RESERVE_SPACE(52);
 	*p++ = cpu_to_be32(OP_LOCKT);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
-	WRITE64(args->fl->fl_start);
-	WRITE64(nfs4_lock_length(args->fl));
-	WRITE64(args->lock_owner.clientid);
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+	p = xdr_encode_hyper(p, args->lock_owner.clientid);
 	*p++ = cpu_to_be32(16);
 	WRITEMEM("lock id:", 8);
-	WRITE64(args->lock_owner.id);
+	p = xdr_encode_hyper(p, args->lock_owner.id);
 	hdr->nops++;
 	hdr->replen += decode_lockt_maxsz;
 }
@@ -1103,8 +1099,8 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
 	WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
-	WRITE64(args->fl->fl_start);
-	WRITE64(nfs4_lock_length(args->fl));
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	hdr->nops++;
 	hdr->replen += decode_locku_maxsz;
 }
@@ -1155,10 +1151,10 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	RESERVE_SPACE(28);
-	WRITE64(arg->clientid);
+	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
 	WRITEMEM("open id:", 8);
-	WRITE64(arg->id);
+	p = xdr_encode_hyper(p, arg->id);
 }
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1334,7 +1330,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(12);
-	WRITE64(args->offset);
+	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_read_maxsz;
@@ -1350,7 +1346,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 
 	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
-	WRITE64(readdir->cookie);
+	p = xdr_encode_hyper(p, readdir->cookie);
 	WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
 	*p++ = cpu_to_be32(readdir->count);
@@ -1417,7 +1413,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 
 	RESERVE_SPACE(12);
 	*p++ = cpu_to_be32(OP_RENEW);
-	WRITE64(client_stateid->cl_clientid);
+	p = xdr_encode_hyper(p, client_stateid->cl_clientid);
 	hdr->nops++;
 	hdr->replen += decode_renew_maxsz;
 }
@@ -1502,7 +1498,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 
 	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
-	WRITE64(client_state->cl_clientid);
+	p = xdr_encode_hyper(p, client_state->cl_clientid);
 	WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
@@ -1518,7 +1514,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(16);
-	WRITE64(args->offset);
+	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->stable);
 	*p++ = cpu_to_be32(args->count);
 
@@ -1574,7 +1570,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 
 	RESERVE_SPACE(8);
-	WRITE64(clp->cl_ex_clid);
+	p = xdr_encode_hyper(p, clp->cl_ex_clid);
 
 	RESERVE_SPACE(8);
 	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
-- 
cgit v0.10.2


From 93f0cf25944695e1229fe90a2897af0211fbd425 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:06 +0300
Subject: nfs: nfs4xdr: get rid of WRITEMEM

s/WRITEMEM(/p = xdr_encode_opaque_fixed(p, /

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5d80766..17915c8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -712,10 +712,6 @@ struct compound_hdr {
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define WRITEMEM(ptr,nbytes)     do {				\
-	p = xdr_encode_opaque_fixed(p, ptr, nbytes);		\
-} while (0)
-
 #define RESERVE_SPACE(nbytes)	do {				\
 	p = xdr_reserve_space(xdr, nbytes);			\
 	BUG_ON(!p);						\
@@ -746,7 +742,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
 	RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
 	*p++ = cpu_to_be32(hdr->taglen);
-	WRITEMEM(hdr->tag, hdr->taglen);
+	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
 	*p++ = cpu_to_be32(hdr->nops);
@@ -845,12 +841,12 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	if (iap->ia_valid & ATTR_UID) {
 		bmval1 |= FATTR4_WORD1_OWNER;
 		*p++ = cpu_to_be32(owner_namelen);
-		WRITEMEM(owner_name, owner_namelen);
+		p = xdr_encode_opaque_fixed(p, owner_name, owner_namelen);
 	}
 	if (iap->ia_valid & ATTR_GID) {
 		bmval1 |= FATTR4_WORD1_OWNER_GROUP;
 		*p++ = cpu_to_be32(owner_grouplen);
-		WRITEMEM(owner_group, owner_grouplen);
+		p = xdr_encode_opaque_fixed(p, owner_group, owner_grouplen);
 	}
 	if (iap->ia_valid & ATTR_ATIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
@@ -909,7 +905,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 	RESERVE_SPACE(8+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_CLOSE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
-	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_close_maxsz;
 }
@@ -953,7 +949,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 
 	RESERVE_SPACE(4 + create->name->len);
 	*p++ = cpu_to_be32(create->name->len);
-	WRITEMEM(create->name->name, create->name->len);
+	p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
 
@@ -1020,7 +1016,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 	RESERVE_SPACE(8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
 	*p++ = cpu_to_be32(name->len);
-	WRITEMEM(name->name, name->len);
+	p = xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
 }
@@ -1057,16 +1053,16 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	if (args->new_lock_owner){
 		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
-		WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 		p = xdr_encode_hyper(p, args->lock_owner.clientid);
 		*p++ = cpu_to_be32(16);
-		WRITEMEM("lock id:", 8);
+		p = xdr_encode_opaque_fixed(p, "lock id:", 8);
 		p = xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
 		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
-		WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
 	hdr->nops++;
@@ -1084,7 +1080,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	p = xdr_encode_hyper(p, args->lock_owner.clientid);
 	*p++ = cpu_to_be32(16);
-	WRITEMEM("lock id:", 8);
+	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
 	p = xdr_encode_hyper(p, args->lock_owner.id);
 	hdr->nops++;
 	hdr->replen += decode_lockt_maxsz;
@@ -1098,7 +1094,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	*p++ = cpu_to_be32(OP_LOCKU);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
-	WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
 	p = xdr_encode_hyper(p, args->fl->fl_start);
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	hdr->nops++;
@@ -1113,7 +1109,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	RESERVE_SPACE(8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
 	*p++ = cpu_to_be32(len);
-	WRITEMEM(name->name, len);
+	p = xdr_encode_opaque_fixed(p, name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
 }
@@ -1153,7 +1149,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 	RESERVE_SPACE(28);
 	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
-	WRITEMEM("open id:", 8);
+	p = xdr_encode_opaque_fixed(p, "open id:", 8);
 	p = xdr_encode_hyper(p, arg->id);
 }
 
@@ -1233,7 +1229,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
-	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1264,7 +1260,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
-	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	hdr->nops++;
 	hdr->replen += decode_open_confirm_maxsz;
@@ -1276,7 +1272,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
-	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	hdr->nops++;
@@ -1292,7 +1288,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	RESERVE_SPACE(8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
 	*p++ = cpu_to_be32(len);
-	WRITEMEM(fh->data, len);
+	p = xdr_encode_opaque_fixed(p, fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
 }
@@ -1315,9 +1311,9 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	RESERVE_SPACE(NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
-		WRITEMEM(stateid.data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 	} else
-		WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 }
 
 static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
@@ -1347,7 +1343,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
 	p = xdr_encode_hyper(p, readdir->cookie);
-	WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
 	*p++ = cpu_to_be32(readdir->count);
 	*p++ = cpu_to_be32(2);
@@ -1386,7 +1382,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 	RESERVE_SPACE(8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
 	*p++ = cpu_to_be32(name->len);
-	WRITEMEM(name->name, name->len);
+	p = xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
 }
@@ -1398,11 +1394,11 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 	RESERVE_SPACE(8 + oldname->len);
 	*p++ = cpu_to_be32(OP_RENAME);
 	*p++ = cpu_to_be32(oldname->len);
-	WRITEMEM(oldname->name, oldname->len);
+	p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
 
 	RESERVE_SPACE(4 + newname->len);
 	*p++ = cpu_to_be32(newname->len);
-	WRITEMEM(newname->name, newname->len);
+	p = xdr_encode_opaque_fixed(p, newname->name, newname->len);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
 }
@@ -1436,7 +1432,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 	RESERVE_SPACE(2*4);
 	*p++ = cpu_to_be32(1);
 	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
@@ -1467,7 +1463,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setattr_maxsz;
 	encode_attrs(xdr, arg->iap, server);
@@ -1479,7 +1475,7 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 
 	RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID);
-	WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
 	RESERVE_SPACE(4);
@@ -1499,7 +1495,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	p = xdr_encode_hyper(p, client_state->cl_clientid);
-	WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
 }
@@ -1530,7 +1526,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 
 	*p++ = cpu_to_be32(OP_DELEGRETURN);
-	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_delegreturn_maxsz;
 }
@@ -1545,7 +1541,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 
 	RESERVE_SPACE(4 + sizeof(args->verifier->data));
 	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
-	WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
+	p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
@@ -1611,7 +1607,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 			clp->cl_ipaddr);
 	RESERVE_SPACE(16 + len);
 	*p++ = cpu_to_be32(len);
-	WRITEMEM(machine_name, len);
+	p = xdr_encode_opaque_fixed(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
 	*p++ = cpu_to_be32(0);				/* No more gids */
@@ -1626,7 +1622,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 	__be32 *p;
 	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
-	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
 	hdr->replen += decode_destroy_session_maxsz;
 }
@@ -1666,7 +1662,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 		slot->seq_nr, args->sa_slotid,
 		tp->highest_used_slotid, args->sa_cache_this);
 	RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
-	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(slot->seq_nr);
 	*p++ = cpu_to_be32(args->sa_slotid);
 	*p++ = cpu_to_be32(tp->highest_used_slotid);
-- 
cgit v0.10.2


From 42edd698125b76a38bd9999015202db036dfbc76 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:13 +0300
Subject: nfs: nfs4xdr: optimize RESERVE_SPACE in encode_create_session and
 encode_sequence

Coalesce multilpe constant RESERVE_SPACEs into one

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 17915c8..d460d81 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1562,17 +1562,15 @@ static void encode_create_session(struct xdr_stream *xdr,
 	uint32_t len;
 	struct nfs_client *clp = args->client;
 
-	RESERVE_SPACE(4);
-	*p++ = cpu_to_be32(OP_CREATE_SESSION);
+	len = scnprintf(machine_name, sizeof(machine_name), "%s",
+			clp->cl_ipaddr);
 
-	RESERVE_SPACE(8);
+	RESERVE_SPACE(20 + 2*28 + 20 + len + 12);
+	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 	p = xdr_encode_hyper(p, clp->cl_ex_clid);
-
-	RESERVE_SPACE(8);
 	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
 	*p++ = cpu_to_be32(args->flags);			/*flags */
 
-	RESERVE_SPACE(2*28);			/* 2 channel_attrs */
 	/* Fore Channel */
 	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
 	*p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz);	/* max req size */
@@ -1591,21 +1589,12 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(args->bc_attrs.max_reqs);	/* max requests */
 	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
 
-	RESERVE_SPACE(4);
 	*p++ = cpu_to_be32(args->cb_program);		/* cb_program */
-
-	RESERVE_SPACE(4);			/* # of security flavors */
 	*p++ = cpu_to_be32(1);
-
-	RESERVE_SPACE(4);
 	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */
 
 	/* authsys_parms rfc1831 */
-	RESERVE_SPACE(4);
 	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
-	len = scnprintf(machine_name, sizeof(machine_name), "%s",
-			clp->cl_ipaddr);
-	RESERVE_SPACE(16 + len);
 	*p++ = cpu_to_be32(len);
 	p = xdr_encode_opaque_fixed(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
@@ -1646,7 +1635,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
 	slot = tp->slots + args->sa_slotid;
 
-	RESERVE_SPACE(4);
+	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN + 16);
 	*p++ = cpu_to_be32(OP_SEQUENCE);
 
 	/*
@@ -1661,7 +1650,6 @@ static void encode_sequence(struct xdr_stream *xdr,
 		((u32 *)session->sess_id.data)[3],
 		slot->seq_nr, args->sa_slotid,
 		tp->highest_used_slotid, args->sa_cache_this);
-	RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
 	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(slot->seq_nr);
 	*p++ = cpu_to_be32(args->sa_slotid);
-- 
cgit v0.10.2


From 2220f13a8b90d2259f3094cb54cf4de67d8eee2d Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:18 +0300
Subject: nfs: nfs4xdr: encode_compound_hdr does not have to round up reserved
 bytes

This is already done by xdr_reserve_space and since encode_compound_hdr
is adding a byte count to "12" which is already word aligned, the xdr
level rounding will work just as well.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d460d81..7c2b162 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -740,7 +740,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
-	RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
+	RESERVE_SPACE(12 + hdr->taglen);
 	*p++ = cpu_to_be32(hdr->taglen);
 	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
-- 
cgit v0.10.2


From 13c65ce90006badccd5663e558e3c85869ae5ce6 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:25 +0300
Subject: nfs: nfs4xdr: change RESERVE_SPACE macro into a static helper

In order to open code and expose the result pointer assignment.

Alternatively, we can open code the call to xdr_reserve_space
and do the BUG_ON an the error case at the call site.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7c2b162..3bcde49 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -702,20 +702,12 @@ struct compound_hdr {
 	u32		minorversion;
 };
 
-/*
- * START OF "GENERIC" ENCODE ROUTINES.
- *   These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
-#define RESERVE_SPACE(nbytes)	do {				\
-	p = xdr_reserve_space(xdr, nbytes);			\
-	BUG_ON(!p);						\
-} while (0)
+static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *p = xdr_reserve_space(xdr, nbytes);
+	BUG_ON(!p);
+	return p;
+}
 
 static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
 {
@@ -740,7 +732,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
-	RESERVE_SPACE(12 + hdr->taglen);
+	p = reserve_space(xdr, 12 + hdr->taglen);
 	*p++ = cpu_to_be32(hdr->taglen);
 	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
@@ -820,7 +812,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 		len += 16;
 	else if (iap->ia_valid & ATTR_MTIME)
 		len += 4;
-	RESERVE_SPACE(len);
+	p = reserve_space(xdr, len);
 
 	/*
 	 * We write the bitmap length now, but leave the bitmap and the attribute
@@ -891,7 +883,7 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_ACCESS);
 	*p++ = cpu_to_be32(access);
 	hdr->nops++;
@@ -902,7 +894,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_CLOSE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
@@ -914,7 +906,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 {
 	__be32 *p;
 
-	RESERVE_SPACE(16);
+	p = reserve_space(xdr, 16);
 	*p++ = cpu_to_be32(OP_COMMIT);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
@@ -926,19 +918,19 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_CREATE);
 	*p++ = cpu_to_be32(create->ftype);
 
 	switch (create->ftype) {
 	case NF4LNK:
-		RESERVE_SPACE(4);
+		p = reserve_space(xdr, 4);
 		*p++ = cpu_to_be32(create->u.symlink.len);
 		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
 		break;
 
 	case NF4BLK: case NF4CHR:
-		RESERVE_SPACE(8);
+		p = reserve_space(xdr, 8);
 		*p++ = cpu_to_be32(create->u.device.specdata1);
 		*p++ = cpu_to_be32(create->u.device.specdata2);
 		break;
@@ -947,7 +939,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 		break;
 	}
 
-	RESERVE_SPACE(4 + create->name->len);
+	p = reserve_space(xdr, 4 + create->name->len);
 	*p++ = cpu_to_be32(create->name->len);
 	p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
 	hdr->nops++;
@@ -960,7 +952,7 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(1);
 	*p++ = cpu_to_be32(bitmap);
@@ -972,7 +964,7 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
 {
 	__be32 *p;
 
-	RESERVE_SPACE(16);
+	p = reserve_space(xdr, 16);
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(2);
 	*p++ = cpu_to_be32(bm0);
@@ -1003,7 +995,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_GETFH);
 	hdr->nops++;
 	hdr->replen += decode_getfh_maxsz;
@@ -1013,7 +1005,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8 + name->len);
+	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
 	*p++ = cpu_to_be32(name->len);
 	p = xdr_encode_opaque_fixed(p, name->name, name->len);
@@ -1043,7 +1035,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 {
 	__be32 *p;
 
-	RESERVE_SPACE(32);
+	p = reserve_space(xdr, 32);
 	*p++ = cpu_to_be32(OP_LOCK);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
 	*p++ = cpu_to_be32(args->reclaim);
@@ -1051,7 +1043,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	*p++ = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
-		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
+		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
 		p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
@@ -1061,7 +1053,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 		p = xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
-		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
+		p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
 		p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
@@ -1073,7 +1065,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 {
 	__be32 *p;
 
-	RESERVE_SPACE(52);
+	p = reserve_space(xdr, 52);
 	*p++ = cpu_to_be32(OP_LOCKT);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	p = xdr_encode_hyper(p, args->fl->fl_start);
@@ -1090,7 +1082,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12+NFS4_STATEID_SIZE+16);
+	p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16);
 	*p++ = cpu_to_be32(OP_LOCKU);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
@@ -1106,7 +1098,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	int len = name->len;
 	__be32 *p;
 
-	RESERVE_SPACE(8 + len);
+	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
 	*p++ = cpu_to_be32(len);
 	p = xdr_encode_opaque_fixed(p, name->name, len);
@@ -1118,7 +1110,7 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	switch (fmode & (FMODE_READ|FMODE_WRITE)) {
 	case FMODE_READ:
 		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
@@ -1142,11 +1134,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
  * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
  * owner 4 = 32
  */
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_OPEN);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
-	RESERVE_SPACE(28);
+	p = reserve_space(xdr, 28);
 	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
 	p = xdr_encode_opaque_fixed(p, "open id:", 8);
@@ -1157,7 +1149,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	switch(arg->open_flags & O_EXCL) {
 	case 0:
 		*p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED);
@@ -1173,7 +1165,7 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	switch (arg->open_flags & O_CREAT) {
 	case 0:
 		*p++ = cpu_to_be32(NFS4_OPEN_NOCREATE);
@@ -1189,7 +1181,7 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	switch (delegation_type) {
 	case 0:
 		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
@@ -1209,7 +1201,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
 	encode_string(xdr, name->len, name->name);
 }
@@ -1218,7 +1210,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
 	encode_delegation_type(xdr, type);
 }
@@ -1227,7 +1219,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
 	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
@@ -1258,7 +1250,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
@@ -1270,7 +1262,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
@@ -1285,7 +1277,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	int len = fh->size;
 	__be32 *p;
 
-	RESERVE_SPACE(8 + len);
+	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
 	*p++ = cpu_to_be32(len);
 	p = xdr_encode_opaque_fixed(p, fh->data, len);
@@ -1297,7 +1289,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_PUTROOTFH);
 	hdr->nops++;
 	hdr->replen += decode_putrootfh_maxsz;
@@ -1308,7 +1300,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	nfs4_stateid stateid;
 	__be32 *p;
 
-	RESERVE_SPACE(NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
 		p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
@@ -1320,12 +1312,12 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_READ);
 
 	encode_stateid(xdr, args->context);
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
@@ -1340,7 +1332,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	};
 	__be32 *p;
 
-	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
+	p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
 	p = xdr_encode_hyper(p, readdir->cookie);
 	p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
@@ -1369,7 +1361,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_READLINK);
 	hdr->nops++;
 	hdr->replen += decode_readlink_maxsz;
@@ -1379,7 +1371,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8 + name->len);
+	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
 	*p++ = cpu_to_be32(name->len);
 	p = xdr_encode_opaque_fixed(p, name->name, name->len);
@@ -1391,12 +1383,12 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8 + oldname->len);
+	p = reserve_space(xdr, 8 + oldname->len);
 	*p++ = cpu_to_be32(OP_RENAME);
 	*p++ = cpu_to_be32(oldname->len);
 	p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
 
-	RESERVE_SPACE(4 + newname->len);
+	p = reserve_space(xdr, 4 + newname->len);
 	*p++ = cpu_to_be32(newname->len);
 	p = xdr_encode_opaque_fixed(p, newname->name, newname->len);
 	hdr->nops++;
@@ -1407,7 +1399,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_RENEW);
 	p = xdr_encode_hyper(p, client_stateid->cl_clientid);
 	hdr->nops++;
@@ -1419,7 +1411,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_RESTOREFH);
 	hdr->nops++;
 	hdr->replen += decode_restorefh_maxsz;
@@ -1430,15 +1422,15 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
 	p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
-	RESERVE_SPACE(2*4);
+	p = reserve_space(xdr, 2*4);
 	*p++ = cpu_to_be32(1);
 	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
 	if (arg->acl_len % 4)
 		return -EINVAL;
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(arg->acl_len);
 	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
 	hdr->nops++;
@@ -1451,7 +1443,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_SAVEFH);
 	hdr->nops++;
 	hdr->replen += decode_savefh_maxsz;
@@ -1461,7 +1453,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
 	p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
@@ -1473,16 +1465,16 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
+	p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID);
 	p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(setclientid->sc_cb_ident);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_maxsz;
@@ -1492,7 +1484,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
+	p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	p = xdr_encode_hyper(p, client_state->cl_clientid);
 	p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
@@ -1504,12 +1496,12 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_WRITE);
 
 	encode_stateid(xdr, args->context);
 
-	RESERVE_SPACE(16);
+	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->stable);
 	*p++ = cpu_to_be32(args->count);
@@ -1523,7 +1515,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 
 	*p++ = cpu_to_be32(OP_DELEGRETURN);
 	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
@@ -1539,13 +1531,13 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4 + sizeof(args->verifier->data));
+	p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
 	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
 	p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(args->flags);
 	*p++ = cpu_to_be32(0);	/* zero length state_protect4_a */
 	*p++ = cpu_to_be32(0);	/* zero length implementation id array */
@@ -1565,7 +1557,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	len = scnprintf(machine_name, sizeof(machine_name), "%s",
 			clp->cl_ipaddr);
 
-	RESERVE_SPACE(20 + 2*28 + 20 + len + 12);
+	p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
 	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 	p = xdr_encode_hyper(p, clp->cl_ex_clid);
 	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
@@ -1609,7 +1601,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 				   struct compound_hdr *hdr)
 {
 	__be32 *p;
-	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
+	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
 	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
@@ -1635,7 +1627,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
 	slot = tp->slots + args->sa_slotid;
 
-	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN + 16);
+	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16);
 	*p++ = cpu_to_be32(OP_SEQUENCE);
 
 	/*
-- 
cgit v0.10.2


From 345585132a204859fbb7d8b662e9b6e5b563c6dc Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:30 +0300
Subject: nfs: nfs4xdr: optimize low level encoding

do not increment encoding ptr if not needed.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3bcde49..d75f821 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -737,7 +737,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
-	*p++ = cpu_to_be32(hdr->nops);
+	*p = cpu_to_be32(hdr->nops);
 }
 
 static void encode_nops(struct compound_hdr *hdr)
@@ -874,7 +874,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	len = (char *)p - (char *)q - 12;
 	*q++ = htonl(bmval0);
 	*q++ = htonl(bmval1);
-	*q++ = htonl(len);
+	*q = htonl(len);
 
 /* out: */
 }
@@ -885,7 +885,7 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
 
 	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_ACCESS);
-	*p++ = cpu_to_be32(access);
+	*p = cpu_to_be32(access);
 	hdr->nops++;
 	hdr->replen += decode_access_maxsz;
 }
@@ -897,7 +897,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 	p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_CLOSE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
-	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_close_maxsz;
 }
@@ -909,7 +909,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 	p = reserve_space(xdr, 16);
 	*p++ = cpu_to_be32(OP_COMMIT);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = cpu_to_be32(args->count);
+	*p = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_commit_maxsz;
 }
@@ -920,19 +920,19 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 
 	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_CREATE);
-	*p++ = cpu_to_be32(create->ftype);
+	*p = cpu_to_be32(create->ftype);
 
 	switch (create->ftype) {
 	case NF4LNK:
 		p = reserve_space(xdr, 4);
-		*p++ = cpu_to_be32(create->u.symlink.len);
+		*p = cpu_to_be32(create->u.symlink.len);
 		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
 		break;
 
 	case NF4BLK: case NF4CHR:
 		p = reserve_space(xdr, 8);
 		*p++ = cpu_to_be32(create->u.device.specdata1);
-		*p++ = cpu_to_be32(create->u.device.specdata2);
+		*p = cpu_to_be32(create->u.device.specdata2);
 		break;
 
 	default:
@@ -941,7 +941,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 
 	p = reserve_space(xdr, 4 + create->name->len);
 	*p++ = cpu_to_be32(create->name->len);
-	p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
+	xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
 
@@ -955,7 +955,7 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
 	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(1);
-	*p++ = cpu_to_be32(bitmap);
+	*p = cpu_to_be32(bitmap);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -968,7 +968,7 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(2);
 	*p++ = cpu_to_be32(bm0);
-	*p++ = cpu_to_be32(bm1);
+	*p = cpu_to_be32(bm1);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -996,7 +996,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_GETFH);
+	*p = cpu_to_be32(OP_GETFH);
 	hdr->nops++;
 	hdr->replen += decode_getfh_maxsz;
 }
@@ -1008,7 +1008,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
 	*p++ = cpu_to_be32(name->len);
-	p = xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
 }
@@ -1041,7 +1041,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	*p++ = cpu_to_be32(args->reclaim);
 	p = xdr_encode_hyper(p, args->fl->fl_start);
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
-	*p++ = cpu_to_be32(args->new_lock_owner);
+	*p = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
 		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
@@ -1050,12 +1050,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 		p = xdr_encode_hyper(p, args->lock_owner.clientid);
 		*p++ = cpu_to_be32(16);
 		p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-		p = xdr_encode_hyper(p, args->lock_owner.id);
+		xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
 		p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
 		p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
-		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
+		*p = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
 	hdr->nops++;
 	hdr->replen += decode_lock_maxsz;
@@ -1073,7 +1073,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	p = xdr_encode_hyper(p, args->lock_owner.clientid);
 	*p++ = cpu_to_be32(16);
 	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-	p = xdr_encode_hyper(p, args->lock_owner.id);
+	xdr_encode_hyper(p, args->lock_owner.id);
 	hdr->nops++;
 	hdr->replen += decode_lockt_maxsz;
 }
@@ -1088,7 +1088,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
 	p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
 	p = xdr_encode_hyper(p, args->fl->fl_start);
-	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+	xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	hdr->nops++;
 	hdr->replen += decode_locku_maxsz;
 }
@@ -1101,7 +1101,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
 	*p++ = cpu_to_be32(len);
-	p = xdr_encode_opaque_fixed(p, name->name, len);
+	xdr_encode_opaque_fixed(p, name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
 }
@@ -1124,7 +1124,7 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
 	default:
 		*p++ = cpu_to_be32(0);
 	}
-	*p++ = cpu_to_be32(0);		/* for linux, share_deny = 0 always */
+	*p = cpu_to_be32(0);		/* for linux, share_deny = 0 always */
 }
 
 static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1136,13 +1136,13 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
  */
 	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_OPEN);
-	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
+	*p = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	p = reserve_space(xdr, 28);
 	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
 	p = xdr_encode_opaque_fixed(p, "open id:", 8);
-	p = xdr_encode_hyper(p, arg->id);
+	xdr_encode_hyper(p, arg->id);
 }
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1152,11 +1152,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 	p = reserve_space(xdr, 4);
 	switch(arg->open_flags & O_EXCL) {
 	case 0:
-		*p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED);
+		*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
 		encode_attrs(xdr, arg->u.attrs, arg->server);
 		break;
 	default:
-		*p++ = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
+		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
 		encode_nfs4_verifier(xdr, &arg->u.verifier);
 	}
 }
@@ -1168,11 +1168,11 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
 	p = reserve_space(xdr, 4);
 	switch (arg->open_flags & O_CREAT) {
 	case 0:
-		*p++ = cpu_to_be32(NFS4_OPEN_NOCREATE);
+		*p = cpu_to_be32(NFS4_OPEN_NOCREATE);
 		break;
 	default:
 		BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
-		*p++ = cpu_to_be32(NFS4_OPEN_CREATE);
+		*p = cpu_to_be32(NFS4_OPEN_CREATE);
 		encode_createmode(xdr, arg);
 	}
 }
@@ -1184,13 +1184,13 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
 	p = reserve_space(xdr, 4);
 	switch (delegation_type) {
 	case 0:
-		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
 		break;
 	case FMODE_READ:
-		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
 		break;
 	case FMODE_WRITE|FMODE_READ:
-		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
 		break;
 	default:
 		BUG();
@@ -1202,7 +1202,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1211,7 +1211,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
 	encode_delegation_type(xdr, type);
 }
 
@@ -1221,7 +1221,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
-	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1253,7 +1253,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
-	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
+	*p = cpu_to_be32(arg->seqid->sequence->counter);
 	hdr->nops++;
 	hdr->replen += decode_open_confirm_maxsz;
 }
@@ -1265,7 +1265,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
-	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
+	*p = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	hdr->nops++;
 	hdr->replen += decode_open_downgrade_maxsz;
@@ -1280,7 +1280,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
 	*p++ = cpu_to_be32(len);
-	p = xdr_encode_opaque_fixed(p, fh->data, len);
+	xdr_encode_opaque_fixed(p, fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
 }
@@ -1290,7 +1290,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_PUTROOTFH);
+	*p = cpu_to_be32(OP_PUTROOTFH);
 	hdr->nops++;
 	hdr->replen += decode_putrootfh_maxsz;
 }
@@ -1303,9 +1303,9 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
-		p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
+		xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 	} else
-		p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
+		xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 }
 
 static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
@@ -1313,13 +1313,13 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_READ);
+	*p = cpu_to_be32(OP_READ);
 
 	encode_stateid(xdr, args->context);
 
 	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = cpu_to_be32(args->count);
+	*p = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_read_maxsz;
 }
@@ -1345,7 +1345,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	else
 		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
-	*p++ = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+	*p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
 	hdr->nops++;
 	hdr->replen += decode_readdir_maxsz;
 	dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
@@ -1362,7 +1362,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_READLINK);
+	*p = cpu_to_be32(OP_READLINK);
 	hdr->nops++;
 	hdr->replen += decode_readlink_maxsz;
 }
@@ -1374,7 +1374,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
 	*p++ = cpu_to_be32(name->len);
-	p = xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
 }
@@ -1386,11 +1386,11 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 	p = reserve_space(xdr, 8 + oldname->len);
 	*p++ = cpu_to_be32(OP_RENAME);
 	*p++ = cpu_to_be32(oldname->len);
-	p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
+	xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
 
 	p = reserve_space(xdr, 4 + newname->len);
 	*p++ = cpu_to_be32(newname->len);
-	p = xdr_encode_opaque_fixed(p, newname->name, newname->len);
+	xdr_encode_opaque_fixed(p, newname->name, newname->len);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
 }
@@ -1401,7 +1401,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 
 	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_RENEW);
-	p = xdr_encode_hyper(p, client_stateid->cl_clientid);
+	xdr_encode_hyper(p, client_stateid->cl_clientid);
 	hdr->nops++;
 	hdr->replen += decode_renew_maxsz;
 }
@@ -1412,7 +1412,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_RESTOREFH);
+	*p = cpu_to_be32(OP_RESTOREFH);
 	hdr->nops++;
 	hdr->replen += decode_restorefh_maxsz;
 }
@@ -1424,14 +1424,14 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 	p = reserve_space(xdr, 2*4);
 	*p++ = cpu_to_be32(1);
-	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
+	*p = cpu_to_be32(FATTR4_WORD0_ACL);
 	if (arg->acl_len % 4)
 		return -EINVAL;
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(arg->acl_len);
+	*p = cpu_to_be32(arg->acl_len);
 	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
 	hdr->nops++;
 	hdr->replen += decode_setacl_maxsz;
@@ -1444,7 +1444,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_SAVEFH);
+	*p = cpu_to_be32(OP_SAVEFH);
 	hdr->nops++;
 	hdr->replen += decode_savefh_maxsz;
 }
@@ -1455,7 +1455,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setattr_maxsz;
 	encode_attrs(xdr, arg->iap, server);
@@ -1467,15 +1467,15 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 
 	p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID);
-	p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
+	xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(setclientid->sc_prog);
+	*p = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(setclientid->sc_cb_ident);
+	*p = cpu_to_be32(setclientid->sc_cb_ident);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_maxsz;
 }
@@ -1487,7 +1487,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 	p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	p = xdr_encode_hyper(p, client_state->cl_clientid);
-	p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+	xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
 }
@@ -1497,14 +1497,14 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_WRITE);
+	*p = cpu_to_be32(OP_WRITE);
 
 	encode_stateid(xdr, args->context);
 
 	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->stable);
-	*p++ = cpu_to_be32(args->count);
+	*p = cpu_to_be32(args->count);
 
 	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
 	hdr->nops++;
@@ -1518,7 +1518,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 
 	*p++ = cpu_to_be32(OP_DELEGRETURN);
-	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_delegreturn_maxsz;
 }
@@ -1533,14 +1533,14 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 
 	p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
 	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
-	p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
+	xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
 	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(args->flags);
 	*p++ = cpu_to_be32(0);	/* zero length state_protect4_a */
-	*p++ = cpu_to_be32(0);	/* zero length implementation id array */
+	*p = cpu_to_be32(0);	/* zero length implementation id array */
 	hdr->nops++;
 	hdr->replen += decode_exchange_id_maxsz;
 }
@@ -1591,7 +1591,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	p = xdr_encode_opaque_fixed(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
-	*p++ = cpu_to_be32(0);				/* No more gids */
+	*p = cpu_to_be32(0);				/* No more gids */
 	hdr->nops++;
 	hdr->replen += decode_create_session_maxsz;
 }
@@ -1603,7 +1603,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 	__be32 *p;
 	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
-	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
 	hdr->replen += decode_destroy_session_maxsz;
 }
@@ -1646,7 +1646,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(slot->seq_nr);
 	*p++ = cpu_to_be32(args->sa_slotid);
 	*p++ = cpu_to_be32(tp->highest_used_slotid);
-	*p++ = cpu_to_be32(args->sa_cache_this);
+	*p = cpu_to_be32(args->sa_cache_this);
 	hdr->nops++;
 	hdr->replen += decode_sequence_maxsz;
 #endif /* CONFIG_NFS_V4_1 */
-- 
cgit v0.10.2


From 811652bd6edd66dd35bf9caacdfe96d19f75a47e Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:34 +0300
Subject: nfs: nfs4xdr: merge xdr_encode_int+xdr_encode_opaque_fixed into
 xdr_encode_opaque

use encode_string where appropriate.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d75f821..efa78ad 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -732,9 +732,8 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
-	p = reserve_space(xdr, 12 + hdr->taglen);
-	*p++ = cpu_to_be32(hdr->taglen);
-	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
+	p = reserve_space(xdr, 4 + hdr->taglen + 8);
+	p = xdr_encode_opaque(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
 	*p = cpu_to_be32(hdr->nops);
@@ -832,13 +831,11 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	}
 	if (iap->ia_valid & ATTR_UID) {
 		bmval1 |= FATTR4_WORD1_OWNER;
-		*p++ = cpu_to_be32(owner_namelen);
-		p = xdr_encode_opaque_fixed(p, owner_name, owner_namelen);
+		p = xdr_encode_opaque(p, owner_name, owner_namelen);
 	}
 	if (iap->ia_valid & ATTR_GID) {
 		bmval1 |= FATTR4_WORD1_OWNER_GROUP;
-		*p++ = cpu_to_be32(owner_grouplen);
-		p = xdr_encode_opaque_fixed(p, owner_group, owner_grouplen);
+		p = xdr_encode_opaque(p, owner_group, owner_grouplen);
 	}
 	if (iap->ia_valid & ATTR_ATIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
@@ -939,9 +936,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 		break;
 	}
 
-	p = reserve_space(xdr, 4 + create->name->len);
-	*p++ = cpu_to_be32(create->name->len);
-	xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
+	encode_string(xdr, create->name->len, create->name->name);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
 
@@ -1007,8 +1002,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
-	*p++ = cpu_to_be32(name->len);
-	xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
 }
@@ -1100,8 +1094,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
-	*p++ = cpu_to_be32(len);
-	xdr_encode_opaque_fixed(p, name->name, len);
+	xdr_encode_opaque(p, name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
 }
@@ -1279,8 +1272,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
-	*p++ = cpu_to_be32(len);
-	xdr_encode_opaque_fixed(p, fh->data, len);
+	xdr_encode_opaque(p, fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
 }
@@ -1373,8 +1365,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
-	*p++ = cpu_to_be32(name->len);
-	xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
 }
@@ -1383,14 +1374,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 {
 	__be32 *p;
 
-	p = reserve_space(xdr, 8 + oldname->len);
-	*p++ = cpu_to_be32(OP_RENAME);
-	*p++ = cpu_to_be32(oldname->len);
-	xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
-
-	p = reserve_space(xdr, 4 + newname->len);
-	*p++ = cpu_to_be32(newname->len);
-	xdr_encode_opaque_fixed(p, newname->name, newname->len);
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(OP_RENAME);
+	encode_string(xdr, oldname->len, oldname->name);
+	encode_string(xdr, newname->len, newname->name);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
 }
@@ -1587,8 +1574,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 
 	/* authsys_parms rfc1831 */
 	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
-	*p++ = cpu_to_be32(len);
-	p = xdr_encode_opaque_fixed(p, machine_name, len);
+	p = xdr_encode_opaque(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
 	*p = cpu_to_be32(0);				/* No more gids */
-- 
cgit v0.10.2


From 6f723f7710024bb151ca8c5277ce8c71beec4db8 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:37 +0300
Subject: nfs: nfs4xdr: get rid of READ32

s/READ32\((.*)\)/\1 = be32_to_cpup(p++)/

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index efa78ad..5f6b46f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,7 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define READ32(x)         (x) = ntohl(*p++)
 #define READ64(x)         do {			\
 	(x) = (u64)ntohl(*p++) << 32;		\
 	(x) |= ntohl(*p++);			\
@@ -2464,7 +2463,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
 	__be32 *p;
 
 	READ_BUF(4);
-	READ32(*len);
+	*len = be32_to_cpup(p++);
 	READ_BUF(*len);
 	*string = (char *)p;
 	return 0;
@@ -2475,13 +2474,13 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	READ_BUF(8);
-	READ32(hdr->status);
-	READ32(hdr->taglen);
+	hdr->status = be32_to_cpup(p++);
+	hdr->taglen = be32_to_cpup(p++);
 
 	READ_BUF(hdr->taglen + 4);
 	hdr->tag = (char *)p;
 	p += XDR_QUADLEN(hdr->taglen);
-	READ32(hdr->nops);
+	hdr->nops = be32_to_cpup(p++);
 	if (unlikely(hdr->nops < 1))
 		return nfs4_stat_to_errno(hdr->status);
 	return 0;
@@ -2494,14 +2493,14 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	int32_t nfserr;
 
 	READ_BUF(8);
-	READ32(opnum);
+	opnum = be32_to_cpup(p++);
 	if (opnum != expected) {
 		dprintk("nfs: Server returned operation"
 			" %d but we issued a request for %d\n",
 				opnum, expected);
 		return -EIO;
 	}
-	READ32(nfserr);
+	nfserr = be32_to_cpup(p++);
 	if (nfserr != NFS_OK)
 		return nfs4_stat_to_errno(nfserr);
 	return 0;
@@ -2524,14 +2523,14 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	__be32 *p;
 
 	READ_BUF(4);
-	READ32(bmlen);
+	bmlen = be32_to_cpup(p++);
 
 	bitmap[0] = bitmap[1] = 0;
 	READ_BUF((bmlen << 2));
 	if (bmlen > 0) {
-		READ32(bitmap[0]);
+		bitmap[0] = be32_to_cpup(p++);
 		if (bmlen > 1)
-			READ32(bitmap[1]);
+			bitmap[1] = be32_to_cpup(p++);
 	}
 	return 0;
 }
@@ -2541,7 +2540,7 @@ static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen,
 	__be32 *p;
 
 	READ_BUF(4);
-	READ32(*attrlen);
+	*attrlen = be32_to_cpup(p++);
 	*savep = xdr->p;
 	return 0;
 }
@@ -2567,7 +2566,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
 		READ_BUF(4);
-		READ32(*type);
+		*type = be32_to_cpup(p++);
 		if (*type < NF4REG || *type > NF4NAMEDATTR) {
 			dprintk("%s: bad type %d\n", __func__, *type);
 			return -EIO;
@@ -2625,7 +2624,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
 	}
 	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2641,7 +2640,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
 	}
 	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2679,7 +2678,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
 	}
 	dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
@@ -2695,7 +2694,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
 	}
 	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
@@ -2796,7 +2795,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 	int status = 0;
 
 	READ_BUF(4);
-	READ32(n);
+	n = be32_to_cpup(p++);
 	if (n == 0)
 		goto root_path;
 	dprintk("path ");
@@ -2848,7 +2847,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	if (unlikely(status != 0))
 		goto out;
 	READ_BUF(4);
-	READ32(n);
+	n = be32_to_cpup(p++);
 	if (n <= 0)
 		goto out_eio;
 	res->nlocations = 0;
@@ -2857,7 +2856,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
 		READ_BUF(4);
-		READ32(m);
+		m = be32_to_cpup(p++);
 
 		loc->nservers = 0;
 		dprintk("%s: servers ", __func__);
@@ -2928,7 +2927,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
 		READ_BUF(4);
-		READ32(*maxlink);
+		*maxlink = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
 	}
 	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
@@ -2945,7 +2944,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
 		READ_BUF(4);
-		READ32(*maxname);
+		*maxname = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
 	}
 	dprintk("%s: maxname=%u\n", __func__, *maxname);
@@ -3005,7 +3004,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
 		READ_BUF(4);
-		READ32(tmp);
+		tmp = be32_to_cpup(p++);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
 		ret = NFS_ATTR_FATTR_MODE;
@@ -3024,7 +3023,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
 		READ_BUF(4);
-		READ32(*nlink);
+		*nlink = be32_to_cpup(p++);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
 		ret = NFS_ATTR_FATTR_NLINK;
 	}
@@ -3043,7 +3042,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
@@ -3071,7 +3070,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
@@ -3101,8 +3100,8 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 		dev_t tmp;
 
 		READ_BUF(8);
-		READ32(major);
-		READ32(minor);
+		major = be32_to_cpup(p++);
+		minor = be32_to_cpup(p++);
 		tmp = MKDEV(major, minor);
 		if (MAJOR(tmp) == major && MINOR(tmp) == minor)
 			*rdev = tmp;
@@ -3191,7 +3190,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 
 	READ_BUF(12);
 	READ64(sec);
-	READ32(nsec);
+	nsec = be32_to_cpup(p++);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
 	return 0;
@@ -3273,7 +3272,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 	__be32 *p;
 
 	READ_BUF(20);
-	READ32(cinfo->atomic);
+	cinfo->atomic = be32_to_cpup(p++);
 	READ64(cinfo->before);
 	READ64(cinfo->after);
 	return 0;
@@ -3289,8 +3288,8 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	if (status)
 		return status;
 	READ_BUF(8);
-	READ32(supp);
-	READ32(acc);
+	supp = be32_to_cpup(p++);
+	acc = be32_to_cpup(p++);
 	access->supported = supp;
 	access->access = acc;
 	return 0;
@@ -3336,7 +3335,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 	if ((status = decode_change_info(xdr, cinfo)))
 		return status;
 	READ_BUF(4);
-	READ32(bmlen);
+	bmlen = be32_to_cpup(p++);
 	READ_BUF(bmlen << 2);
 	return 0;
 }
@@ -3591,7 +3590,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 		return status;
 
 	READ_BUF(4);
-	READ32(len);
+	len = be32_to_cpup(p++);
 	if (len > NFS4_FHSIZE)
 		return -EIO;
 	fh->size = len;
@@ -3622,7 +3621,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	READ_BUF(32);
 	READ64(offset);
 	READ64(length);
-	READ32(type);
+	type = be32_to_cpup(p++);
 	if (fl != NULL) {
 		fl->fl_start = (loff_t)offset;
 		fl->fl_end = fl->fl_start + (loff_t)length - 1;
@@ -3634,7 +3633,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 		fl->fl_pid = 0;
 	}
 	READ64(clientid);
-	READ32(namelen);
+	namelen = be32_to_cpup(p++);
 	READ_BUF(namelen);
 	return -NFS4ERR_DENIED;
 }
@@ -3695,14 +3694,14 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	uint32_t limit_type, nblocks, blocksize;
 
 	READ_BUF(12);
-	READ32(limit_type);
+	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
 		READ64(*maxsize);
 		break;
 	case 2:
-		READ32(nblocks);
-		READ32(blocksize);
+		nblocks = be32_to_cpup(p++);
+		blocksize = be32_to_cpup(p++);
 		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
 	return 0;
@@ -3714,14 +3713,14 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	uint32_t delegation_type;
 
 	READ_BUF(4);
-	READ32(delegation_type);
+	delegation_type = be32_to_cpup(p++);
 	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
 		res->delegation_type = 0;
 		return 0;
 	}
 	READ_BUF(NFS4_STATEID_SIZE+4);
 	COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
-	READ32(res->do_recall);
+	res->do_recall = be32_to_cpup(p++);
 
 	switch (delegation_type) {
 	case NFS4_OPEN_DELEGATE_READ:
@@ -3752,15 +3751,15 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	decode_change_info(xdr, &res->cinfo);
 
 	READ_BUF(8);
-	READ32(res->rflags);
-	READ32(bmlen);
+	res->rflags = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p++);
 	if (bmlen > 10)
 		goto xdr_error;
 
 	READ_BUF(bmlen << 2);
 	savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
 	for (i = 0; i < savewords; ++i)
-		READ32(res->attrset[i]);
+		res->attrset[i] = be32_to_cpup(p++);
 	for (; i < NFS4_BITMAP_SIZE; i++)
 		res->attrset[i] = 0;
 
@@ -3821,8 +3820,8 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	if (status)
 		return status;
 	READ_BUF(8);
-	READ32(eof);
-	READ32(count);
+	eof = be32_to_cpup(p++);
+	count = be32_to_cpup(p++);
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
@@ -3948,7 +3947,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 
 	/* Convert length of symlink */
 	READ_BUF(4);
-	READ32(len);
+	len = be32_to_cpup(p++);
 	if (len >= rcvbuf->page_len || len <= 0) {
 		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
@@ -4070,7 +4069,7 @@ static int decode_setattr(struct xdr_stream *xdr)
 	if (status)
 		return status;
 	READ_BUF(4);
-	READ32(bmlen);
+	bmlen = be32_to_cpup(p++);
 	READ_BUF(bmlen << 2);
 	return 0;
 }
@@ -4082,13 +4081,13 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	int32_t nfserr;
 
 	READ_BUF(8);
-	READ32(opnum);
+	opnum = be32_to_cpup(p++);
 	if (opnum != OP_SETCLIENTID) {
 		dprintk("nfs: decode_setclientid: Server returned operation"
 			" %d\n", opnum);
 		return -EIO;
 	}
-	READ32(nfserr);
+	nfserr = be32_to_cpup(p++);
 	if (nfserr == NFS_OK) {
 		READ_BUF(8 + NFS4_VERIFIER_SIZE);
 		READ64(clp->cl_clientid);
@@ -4098,12 +4097,12 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 
 		/* skip netid string */
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 
 		/* skip uaddr string */
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 		return -NFSERR_CLID_INUSE;
 	} else
@@ -4127,8 +4126,8 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 		return status;
 
 	READ_BUF(16);
-	READ32(res->count);
-	READ32(res->verf->committed);
+	res->count = be32_to_cpup(p++);
+	res->verf->committed = be32_to_cpup(p++);
 	COPYMEM(res->verf->verifier, 8);
 	return 0;
 }
@@ -4154,11 +4153,11 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	READ_BUF(8);
 	READ64(clp->cl_ex_clid);
 	READ_BUF(12);
-	READ32(clp->cl_seqid);
-	READ32(clp->cl_exchange_flags);
+	clp->cl_seqid = be32_to_cpup(p++);
+	clp->cl_exchange_flags = be32_to_cpup(p++);
 
 	/* We ask for SP4_NONE */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	if (dummy != SP4_NONE)
 		return -EIO;
 
@@ -4167,17 +4166,17 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 
 	/* Throw away Major id */
 	READ_BUF(4);
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	READ_BUF(dummy);
 
 	/* Throw away server_scope */
 	READ_BUF(4);
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	READ_BUF(dummy);
 
 	/* Throw away Implementation id array */
 	READ_BUF(4);
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	READ_BUF(dummy);
 
 	return 0;
@@ -4190,13 +4189,13 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	u32 nr_attrs;
 
 	READ_BUF(28);
-	READ32(attrs->headerpadsz);
-	READ32(attrs->max_rqst_sz);
-	READ32(attrs->max_resp_sz);
-	READ32(attrs->max_resp_sz_cached);
-	READ32(attrs->max_ops);
-	READ32(attrs->max_reqs);
-	READ32(nr_attrs);
+	attrs->headerpadsz = be32_to_cpup(p++);
+	attrs->max_rqst_sz = be32_to_cpup(p++);
+	attrs->max_resp_sz = be32_to_cpup(p++);
+	attrs->max_resp_sz_cached = be32_to_cpup(p++);
+	attrs->max_ops = be32_to_cpup(p++);
+	attrs->max_reqs = be32_to_cpup(p++);
+	nr_attrs = be32_to_cpup(p++);
 	if (unlikely(nr_attrs > 1)) {
 		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
 			__func__, nr_attrs);
@@ -4226,8 +4225,8 @@ static int decode_create_session(struct xdr_stream *xdr,
 
 	/* seqid, flags */
 	READ_BUF(8);
-	READ32(clp->cl_seqid);
-	READ32(session->flags);
+	clp->cl_seqid = be32_to_cpup(p++);
+	session->flags = be32_to_cpup(p++);
 
 	/* Channel attributes */
 	status = decode_chan_attrs(xdr, &session->fc_attrs);
@@ -4275,23 +4274,23 @@ static int decode_sequence(struct xdr_stream *xdr,
 		goto out_err;
 	}
 	/* seqid */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	if (dummy != slot->seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out_err;
 	}
 	/* slot id */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	if (dummy != res->sr_slotid) {
 		dprintk("%s Invalid slot id\n", __func__);
 		goto out_err;
 	}
 	/* highest slot id - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	/* target highest slot id - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	/* result flags - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	status = 0;
 out_err:
 	res->sr_status = status;
-- 
cgit v0.10.2


From 3ceb4dbb993fdab6a6fafc69db36686278871134 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:41 +0300
Subject: nfs: nfs4xdr: get rid of READ64

s/READ64\(\*(.*)\)/p = xdr_decode_hyper(p, \1)/
s/READ64\((.*)\)/p = xdr_decode_hyper(p, &\1)/

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5f6b46f..238189c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,10 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define READ64(x)         do {			\
-	(x) = (u64)ntohl(*p++) << 32;		\
-	(x) |= ntohl(*p++);			\
-} while (0)
 #define READTIME(x)       do {			\
 	p++;					\
 	(x.tv_sec) = ntohl(*p++);		\
@@ -2588,7 +2584,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
 		READ_BUF(8);
-		READ64(*change);
+		p = xdr_decode_hyper(p, change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
 		ret = NFS_ATTR_FATTR_CHANGE;
 	}
@@ -2607,7 +2603,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
 		READ_BUF(8);
-		READ64(*size);
+		p = xdr_decode_hyper(p, size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
 		ret = NFS_ATTR_FATTR_SIZE;
 	}
@@ -2658,8 +2654,8 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
 		READ_BUF(16);
-		READ64(fsid->major);
-		READ64(fsid->minor);
+		p = xdr_decode_hyper(p, &fsid->major);
+		p = xdr_decode_hyper(p, &fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
 		ret = NFS_ATTR_FATTR_FSID;
 	}
@@ -2711,7 +2707,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
 		READ_BUF(8);
-		READ64(*fileid);
+		p = xdr_decode_hyper(p, fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2729,7 +2725,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
 		READ_BUF(8);
-		READ64(*fileid);
+		p = xdr_decode_hyper(p, fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2747,7 +2743,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
 	}
 	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -2764,7 +2760,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
 	}
 	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
@@ -2781,7 +2777,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
 	}
 	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
@@ -2910,7 +2906,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
 	}
 	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
@@ -2962,7 +2958,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
 		uint64_t maxread;
 		READ_BUF(8);
-		READ64(maxread);
+		p = xdr_decode_hyper(p, &maxread);
 		if (maxread > 0x7FFFFFFF)
 			maxread = 0x7FFFFFFF;
 		*res = (uint32_t)maxread;
@@ -2983,7 +2979,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
 		uint64_t maxwrite;
 		READ_BUF(8);
-		READ64(maxwrite);
+		p = xdr_decode_hyper(p, &maxwrite);
 		if (maxwrite > 0x7FFFFFFF)
 			maxwrite = 0x7FFFFFFF;
 		*res = (uint32_t)maxwrite;
@@ -3122,7 +3118,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
 	}
 	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -3139,7 +3135,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
 	}
 	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
@@ -3156,7 +3152,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
 	}
 	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
@@ -3173,7 +3169,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
 		READ_BUF(8);
-		READ64(*used);
+		p = xdr_decode_hyper(p, used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
 		ret = NFS_ATTR_FATTR_SPACE_USED;
 	}
@@ -3189,7 +3185,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 	uint32_t nsec;
 
 	READ_BUF(12);
-	READ64(sec);
+	p = xdr_decode_hyper(p, &sec);
 	nsec = be32_to_cpup(p++);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
@@ -3273,8 +3269,8 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 
 	READ_BUF(20);
 	cinfo->atomic = be32_to_cpup(p++);
-	READ64(cinfo->before);
-	READ64(cinfo->after);
+	p = xdr_decode_hyper(p, &cinfo->before);
+	p = xdr_decode_hyper(p, &cinfo->after);
 	return 0;
 }
 
@@ -3619,8 +3615,8 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	uint32_t namelen, type;
 
 	READ_BUF(32);
-	READ64(offset);
-	READ64(length);
+	p = xdr_decode_hyper(p, &offset);
+	p = xdr_decode_hyper(p, &length);
 	type = be32_to_cpup(p++);
 	if (fl != NULL) {
 		fl->fl_start = (loff_t)offset;
@@ -3632,7 +3628,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 			fl->fl_type = F_RDLCK;
 		fl->fl_pid = 0;
 	}
-	READ64(clientid);
+	p = xdr_decode_hyper(p, &clientid);
 	namelen = be32_to_cpup(p++);
 	READ_BUF(namelen);
 	return -NFS4ERR_DENIED;
@@ -3697,7 +3693,7 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
-		READ64(*maxsize);
+		p = xdr_decode_hyper(p, maxsize);
 		break;
 	case 2:
 		nblocks = be32_to_cpup(p++);
@@ -4090,7 +4086,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	nfserr = be32_to_cpup(p++);
 	if (nfserr == NFS_OK) {
 		READ_BUF(8 + NFS4_VERIFIER_SIZE);
-		READ64(clp->cl_clientid);
+		p = xdr_decode_hyper(p, &clp->cl_clientid);
 		COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
@@ -4151,7 +4147,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 		return status;
 
 	READ_BUF(8);
-	READ64(clp->cl_ex_clid);
+	p = xdr_decode_hyper(p, &clp->cl_ex_clid);
 	READ_BUF(12);
 	clp->cl_seqid = be32_to_cpup(p++);
 	clp->cl_exchange_flags = be32_to_cpup(p++);
-- 
cgit v0.10.2


From c816fd3406462702dee2e3859e70132c3aab7c10 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:44 +0300
Subject: nfs: nfs4xdr: get rid of READTIME

It has no users.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 238189c..0c26bb2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,11 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define READTIME(x)       do {			\
-	p++;					\
-	(x.tv_sec) = ntohl(*p++);		\
-	(x.tv_nsec) = ntohl(*p++);		\
-} while (0)
 #define COPYMEM(x,nbytes) do {			\
 	memcpy((x), p, nbytes);			\
 	p += XDR_QUADLEN(nbytes);		\
-- 
cgit v0.10.2


From 686841b3cc3a71918b45ed148be7a01a4f10e3f8 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:48 +0300
Subject: nfs: nfs4xdr: introduce print_overflow_msg

Part fo the nfs4xdr cleanup.  READ_BUF will go away.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0c26bb2..8255ec7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2441,14 +2441,18 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
 #define READ_BUF(nbytes)  do { \
 	p = xdr_inline_decode(xdr, nbytes); \
 	if (unlikely(!p)) { \
-		dprintk("nfs: %s: prematurely hit end of receive" \
-				" buffer\n", __func__); \
-		dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
-				__func__, xdr->p, nbytes, xdr->end); \
+		print_overflow_msg(__func__, xdr); \
 		return -EIO; \
 	} \
 } while (0)
 
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("nfs: %s: prematurely hit end of receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
 static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
 {
 	__be32 *p;
-- 
cgit v0.10.2


From 07d30434cfe2f1a1553143c6b20f1fe68d2ef80a Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:52 +0300
Subject: nfs: nfs4xdr: introduce decode_opaque_fixed and decode_stateid
 helpers

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8255ec7..86e6983 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3290,19 +3290,34 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	return 0;
 }
 
-static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
 {
 	__be32 *p;
+
+	p = xdr_inline_decode(xdr, len);
+	if (likely(p)) {
+		memcpy(buf, p, len);
+		return 0;
+	}
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
+}
+
+static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+{
 	int status;
 
 	status = decode_op_hdr(xdr, OP_CLOSE);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
 }
 
 static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
@@ -3635,15 +3650,15 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 
 static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCK);
 	if (status == -EIO)
 		goto out;
 	if (status == 0) {
-		READ_BUF(NFS4_STATEID_SIZE);
-		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
+		status = decode_stateid(xdr, &res->stateid);
+		if (unlikely(status))
+			goto out;
 	} else if (status == -NFS4ERR_DENIED)
 		status = decode_lock_denied(xdr, NULL);
 	if (res->open_seqid != NULL)
@@ -3664,16 +3679,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
 
 static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCKU);
 	if (status != -EIO)
 		nfs_increment_lock_seqid(status, res->seqid);
-	if (status == 0) {
-		READ_BUF(NFS4_STATEID_SIZE);
-		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	}
+	if (status == 0)
+		status = decode_stateid(xdr, &res->stateid);
 	return status;
 }
 
@@ -3706,6 +3718,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 {
 	__be32 *p;
 	uint32_t delegation_type;
+	int status;
 
 	READ_BUF(4);
 	delegation_type = be32_to_cpup(p++);
@@ -3713,8 +3726,10 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 		res->delegation_type = 0;
 		return 0;
 	}
-	READ_BUF(NFS4_STATEID_SIZE+4);
-	COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
+	status = decode_stateid(xdr, &res->delegation);
+	if (unlikely(status))
+		return status;
+	READ_BUF(4);
 	res->do_recall = be32_to_cpup(p++);
 
 	switch (delegation_type) {
@@ -3738,10 +3753,10 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	status = decode_op_hdr(xdr, OP_OPEN);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	if (unlikely(status))
 		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
 
 	decode_change_info(xdr, &res->cinfo);
 
@@ -3766,32 +3781,26 @@ xdr_error:
 
 static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
 }
 
 static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
 }
 
 static int decode_putfh(struct xdr_stream *xdr)
-- 
cgit v0.10.2


From db942bbd09563e169cc5d9004c32c1de33220fd1 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:56 +0300
Subject: nfs: nfs4xdr: introduce decode_verifier helper

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Trond: Fixed up an 'uninitialised variable' issue in decode_readdir]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 86e6983..404f2e6 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3320,17 +3320,19 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 	return status;
 }
 
+static int decode_verifier(struct xdr_stream *xdr, void *verifier)
+{
+	return decode_opaque_fixed(xdr, verifier, 8);
+}
+
 static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_COMMIT);
-	if (status)
-		return status;
-	READ_BUF(8);
-	COPYMEM(res->verf->verifier, 8);
-	return 0;
+	if (!status)
+		status = decode_verifier(xdr, res->verf->verifier);
+	return status;
 }
 
 static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3852,17 +3854,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	int		status;
 
 	status = decode_op_hdr(xdr, OP_READDIR);
-	if (status)
+	if (!status)
+		status = decode_verifier(xdr, readdir->verifier.data);
+	if (unlikely(status))
 		return status;
-	READ_BUF(8);
-	COPYMEM(readdir->verifier.data, 8);
 	dprintk("%s: verifier = %08x:%08x\n",
 			__func__,
 			((u32 *)readdir->verifier.data)[0],
 			((u32 *)readdir->verifier.data)[1]);
 
 
-	hdrlen = (char *) p - (char *) iov->iov_base;
+	hdrlen = (char *) xdr->p - (char *) iov->iov_base;
 	recvd = rcvbuf->len - hdrlen;
 	if (pglen > recvd)
 		pglen = recvd;
-- 
cgit v0.10.2


From e78291e4e07520348b0634095cf19ed3bc868965 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:00 +0300
Subject: nfs: nfs4xdr: introduce decode_sessionid helper

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 404f2e6..00630f4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4212,6 +4212,11 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	return 0;
 }
 
+static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
+{
+	return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
+}
+
 static int decode_create_session(struct xdr_stream *xdr,
 				 struct nfs41_create_session_res *res)
 {
@@ -4221,14 +4226,11 @@ static int decode_create_session(struct xdr_stream *xdr,
 	struct nfs4_session *session = clp->cl_session;
 
 	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
-
-	if (status)
+	if (!status)
+		status = decode_sessionid(xdr, &session->sess_id);
+	if (unlikely(status))
 		return status;
 
-	/* sessionid */
-	READ_BUF(NFS4_MAX_SESSIONID_LEN);
-	COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
-
 	/* seqid, flags */
 	READ_BUF(8);
 	clp->cl_seqid = be32_to_cpup(p++);
@@ -4262,7 +4264,9 @@ static int decode_sequence(struct xdr_stream *xdr,
 		return 0;
 
 	status = decode_op_hdr(xdr, OP_SEQUENCE);
-	if (status)
+	if (!status)
+		status = decode_sessionid(xdr, &id);
+	if (unlikely(status))
 		goto out_err;
 
 	/*
@@ -4271,15 +4275,16 @@ static int decode_sequence(struct xdr_stream *xdr,
 	 */
 	status = -ESERVERFAULT;
 
-	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
-	READ_BUF(NFS4_MAX_SESSIONID_LEN + 20);
-	COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
 	if (memcmp(id.data, res->sr_session->sess_id.data,
 		   NFS4_MAX_SESSIONID_LEN)) {
 		dprintk("%s Invalid session id\n", __func__);
 		goto out_err;
 	}
+
+	READ_BUF(20);
+
 	/* seqid */
+	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
 	dummy = be32_to_cpup(p++);
 	if (dummy != slot->seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
-- 
cgit v0.10.2


From 99398d0655ada44ae464a1c93d13cd438a306ecd Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:05 +0300
Subject: nfs: nfs4xdr: get rid of COPYMEM

Just directly call memcpy.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 00630f4..f69aafc 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,11 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define COPYMEM(x,nbytes) do {			\
-	memcpy((x), p, nbytes);			\
-	p += XDR_QUADLEN(nbytes);		\
-} while (0)
-
 #define READ_BUF(nbytes)  do { \
 	p = xdr_inline_decode(xdr, nbytes); \
 	if (unlikely(!p)) { \
@@ -3607,7 +3602,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 		return -EIO;
 	fh->size = len;
 	READ_BUF(len);
-	COPYMEM(fh->data, len);
+	memcpy(fh->data, p, len);
 	return 0;
 }
 
@@ -4097,7 +4092,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	if (nfserr == NFS_OK) {
 		READ_BUF(8 + NFS4_VERIFIER_SIZE);
 		p = xdr_decode_hyper(p, &clp->cl_clientid);
-		COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE);
+		memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
 
@@ -4134,7 +4129,7 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 	READ_BUF(16);
 	res->count = be32_to_cpup(p++);
 	res->verf->committed = be32_to_cpup(p++);
-	COPYMEM(res->verf->verifier, 8);
+	memcpy(res->verf->verifier, p, 8);
 	return 0;
 }
 
-- 
cgit v0.10.2


From 2460ba57c49c36dfef0b62c929461de09240fe17 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:10 +0300
Subject: nfs: nfs4xdr: simplify decode_exchange_id by reusing
 decode_opaque_inline

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f69aafc..3f49da0 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4144,6 +4144,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 {
 	__be32 *p;
 	uint32_t dummy;
+	char *dummy_str;
 	int status;
 	struct nfs_client *clp = res->client;
 
@@ -4166,19 +4167,19 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	READ_BUF(8);
 
 	/* Throw away Major id */
-	READ_BUF(4);
-	dummy = be32_to_cpup(p++);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	/* Throw away server_scope */
-	READ_BUF(4);
-	dummy = be32_to_cpup(p++);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	/* Throw away Implementation id array */
-	READ_BUF(4);
-	dummy = be32_to_cpup(p++);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	return 0;
 }
-- 
cgit v0.10.2


From c0eae66ece40bdb8cd88c5106834b71a1c9f421c Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:14 +0300
Subject: nfs: nfs4xdr: get rid of READ_BUF

Use xdr_inline_decode instead.
Open code debug printout and error return.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3f49da0..b0d690c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2423,24 +2423,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
 }
 #endif /* CONFIG_NFS_V4_1 */
 
-/*
- * START OF "GENERIC" DECODE ROUTINES.
- *   These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
-#define READ_BUF(nbytes)  do { \
-	p = xdr_inline_decode(xdr, nbytes); \
-	if (unlikely(!p)) { \
-		print_overflow_msg(__func__, xdr); \
-		return -EIO; \
-	} \
-} while (0)
-
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
 	dprintk("nfs: %s: prematurely hit end of receive buffer. "
@@ -2452,28 +2434,42 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
 {
 	__be32 *p;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	*len = be32_to_cpup(p++);
-	READ_BUF(*len);
+	p = xdr_inline_decode(xdr, *len);
+	if (unlikely(!p))
+		goto out_overflow;
 	*string = (char *)p;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	hdr->status = be32_to_cpup(p++);
 	hdr->taglen = be32_to_cpup(p++);
 
-	READ_BUF(hdr->taglen + 4);
+	p = xdr_inline_decode(xdr, hdr->taglen + 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	hdr->tag = (char *)p;
 	p += XDR_QUADLEN(hdr->taglen);
 	hdr->nops = be32_to_cpup(p++);
 	if (unlikely(hdr->nops < 1))
 		return nfs4_stat_to_errno(hdr->status);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
@@ -2482,7 +2478,9 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	uint32_t opnum;
 	int32_t nfserr;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	opnum = be32_to_cpup(p++);
 	if (opnum != expected) {
 		dprintk("nfs: Server returned operation"
@@ -2494,6 +2492,9 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	if (nfserr != NFS_OK)
 		return nfs4_stat_to_errno(nfserr);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 /* Dummy routine */
@@ -2503,8 +2504,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
 	unsigned int strlen;
 	char *str;
 
-	READ_BUF(12);
-	return decode_opaque_inline(xdr, &strlen, &str);
+	p = xdr_inline_decode(xdr, 12);
+	if (likely(p))
+		return decode_opaque_inline(xdr, &strlen, &str);
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -2512,27 +2516,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	uint32_t bmlen;
 	__be32 *p;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	bmlen = be32_to_cpup(p++);
 
 	bitmap[0] = bitmap[1] = 0;
-	READ_BUF((bmlen << 2));
+	p = xdr_inline_decode(xdr, (bmlen << 2));
+	if (unlikely(!p))
+		goto out_overflow;
 	if (bmlen > 0) {
 		bitmap[0] = be32_to_cpup(p++);
 		if (bmlen > 1)
 			bitmap[1] = be32_to_cpup(p++);
 	}
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
 {
 	__be32 *p;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	*attrlen = be32_to_cpup(p++);
 	*savep = xdr->p;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -2555,7 +2571,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*type = be32_to_cpup(p++);
 		if (*type < NF4REG || *type > NF4NAMEDATTR) {
 			dprintk("%s: bad type %d\n", __func__, *type);
@@ -2566,6 +2584,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 	}
 	dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -2577,7 +2598,9 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
 		ret = NFS_ATTR_FATTR_CHANGE;
@@ -2585,6 +2608,9 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 	dprintk("%s: change attribute=%Lu\n", __func__,
 			(unsigned long long)*change);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -2596,13 +2622,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
 		ret = NFS_ATTR_FATTR_SIZE;
 	}
 	dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2613,12 +2644,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
 	}
 	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2629,12 +2665,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
 	}
 	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -2647,7 +2688,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
-		READ_BUF(16);
+		p = xdr_inline_decode(xdr, 16);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &fsid->major);
 		p = xdr_decode_hyper(p, &fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
@@ -2657,6 +2700,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 			(unsigned long long)fsid->major,
 			(unsigned long long)fsid->minor);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2667,12 +2713,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
 	}
 	dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2683,12 +2734,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
 	}
 	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2700,13 +2756,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2718,13 +2779,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2736,12 +2802,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
 	}
 	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2753,12 +2824,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
 	}
 	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2770,12 +2846,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
 	}
 	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -2784,7 +2865,9 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 	__be32 *p;
 	int status = 0;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	n = be32_to_cpup(p++);
 	if (n == 0)
 		goto root_path;
@@ -2819,6 +2902,9 @@ out_eio:
 	dprintk(" status %d", status);
 	status = -EIO;
 	goto out;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -2836,7 +2922,9 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	status = decode_pathname(xdr, &res->fs_path);
 	if (unlikely(status != 0))
 		goto out;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	n = be32_to_cpup(p++);
 	if (n <= 0)
 		goto out_eio;
@@ -2845,7 +2933,9 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		u32 m;
 		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		m = be32_to_cpup(p++);
 
 		loc->nservers = 0;
@@ -2885,6 +2975,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 out:
 	dprintk("%s: fs_locations done, error = %d\n", __func__, status);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
 out_eio:
 	status = -EIO;
 	goto out;
@@ -2899,12 +2991,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
 	}
 	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -2916,12 +3013,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*maxlink = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
 	}
 	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -2933,12 +3035,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*maxname = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
 	}
 	dprintk("%s: maxname=%u\n", __func__, *maxname);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2951,7 +3058,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
 		uint64_t maxread;
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &maxread);
 		if (maxread > 0x7FFFFFFF)
 			maxread = 0x7FFFFFFF;
@@ -2960,6 +3069,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	}
 	dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2972,7 +3084,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
 		uint64_t maxwrite;
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &maxwrite);
 		if (maxwrite > 0x7FFFFFFF)
 			maxwrite = 0x7FFFFFFF;
@@ -2981,6 +3095,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 	}
 	dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -2993,7 +3110,9 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		tmp = be32_to_cpup(p++);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
@@ -3001,6 +3120,9 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 	}
 	dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3012,13 +3134,18 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*nlink = be32_to_cpup(p++);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
 		ret = NFS_ATTR_FATTR_NLINK;
 	}
 	dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
@@ -3031,9 +3158,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
 				ret = NFS_ATTR_FATTR_OWNER;
@@ -3047,6 +3178,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	}
 	dprintk("%s: uid=%d\n", __func__, (int)*uid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
@@ -3059,9 +3193,13 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
 				ret = NFS_ATTR_FATTR_GROUP;
@@ -3075,6 +3213,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	}
 	dprintk("%s: gid=%d\n", __func__, (int)*gid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -3089,7 +3230,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 	if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) {
 		dev_t tmp;
 
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		major = be32_to_cpup(p++);
 		minor = be32_to_cpup(p++);
 		tmp = MKDEV(major, minor);
@@ -3100,6 +3243,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 	}
 	dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3111,12 +3257,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
 	}
 	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3128,12 +3279,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
 	}
 	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3145,12 +3301,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
 	}
 	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -3162,7 +3323,9 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
 		ret = NFS_ATTR_FATTR_SPACE_USED;
@@ -3170,6 +3333,9 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	dprintk("%s: space used=%Lu\n", __func__,
 			(unsigned long long)*used);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -3178,12 +3344,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 	uint64_t sec;
 	uint32_t nsec;
 
-	READ_BUF(12);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	p = xdr_decode_hyper(p, &sec);
 	nsec = be32_to_cpup(p++);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -3261,11 +3432,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 {
 	__be32 *p;
 
-	READ_BUF(20);
+	p = xdr_inline_decode(xdr, 20);
+	if (unlikely(!p))
+		goto out_overflow;
 	cinfo->atomic = be32_to_cpup(p++);
 	p = xdr_decode_hyper(p, &cinfo->before);
 	p = xdr_decode_hyper(p, &cinfo->after);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
@@ -3277,12 +3453,17 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	status = decode_op_hdr(xdr, OP_ACCESS);
 	if (status)
 		return status;
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	supp = be32_to_cpup(p++);
 	acc = be32_to_cpup(p++);
 	access->supported = supp;
 	access->access = acc;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
@@ -3341,10 +3522,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 		return status;
 	if ((status = decode_change_info(xdr, cinfo)))
 		return status;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	bmlen = be32_to_cpup(p++);
-	READ_BUF(bmlen << 2);
-	return 0;
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (likely(p))
+		return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
@@ -3596,14 +3783,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 	if (status)
 		return status;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	len = be32_to_cpup(p++);
 	if (len > NFS4_FHSIZE)
 		return -EIO;
 	fh->size = len;
-	READ_BUF(len);
+	p = xdr_inline_decode(xdr, len);
+	if (unlikely(!p))
+		goto out_overflow;
 	memcpy(fh->data, p, len);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3625,7 +3819,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	__be32 *p;
 	uint32_t namelen, type;
 
-	READ_BUF(32);
+	p = xdr_inline_decode(xdr, 32);
+	if (unlikely(!p))
+		goto out_overflow;
 	p = xdr_decode_hyper(p, &offset);
 	p = xdr_decode_hyper(p, &length);
 	type = be32_to_cpup(p++);
@@ -3641,8 +3837,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	}
 	p = xdr_decode_hyper(p, &clientid);
 	namelen = be32_to_cpup(p++);
-	READ_BUF(namelen);
-	return -NFS4ERR_DENIED;
+	p = xdr_inline_decode(xdr, namelen);
+	if (likely(p))
+		return -NFS4ERR_DENIED;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
@@ -3697,7 +3897,9 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	__be32 *p;
 	uint32_t limit_type, nblocks, blocksize;
 
-	READ_BUF(12);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
@@ -3709,6 +3911,9 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3717,7 +3922,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	uint32_t delegation_type;
 	int status;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	delegation_type = be32_to_cpup(p++);
 	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
 		res->delegation_type = 0;
@@ -3726,7 +3933,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	status = decode_stateid(xdr, &res->delegation);
 	if (unlikely(status))
 		return status;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	res->do_recall = be32_to_cpup(p++);
 
 	switch (delegation_type) {
@@ -3739,6 +3948,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 				return -EIO;
 	}
 	return decode_ace(xdr, NULL, res->server->nfs_client);
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3757,13 +3969,17 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 
 	decode_change_info(xdr, &res->cinfo);
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	res->rflags = be32_to_cpup(p++);
 	bmlen = be32_to_cpup(p++);
 	if (bmlen > 10)
 		goto xdr_error;
 
-	READ_BUF(bmlen << 2);
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (unlikely(!p))
+		goto out_overflow;
 	savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
 	for (i = 0; i < savewords; ++i)
 		res->attrset[i] = be32_to_cpup(p++);
@@ -3774,6 +3990,9 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 xdr_error:
 	dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
 	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
@@ -3820,7 +4039,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	status = decode_op_hdr(xdr, OP_READ);
 	if (status)
 		return status;
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	eof = be32_to_cpup(p++);
 	count = be32_to_cpup(p++);
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
@@ -3835,6 +4056,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	res->eof = eof;
 	res->count = count;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -3947,7 +4171,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 		return status;
 
 	/* Convert length of symlink */
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	len = be32_to_cpup(p++);
 	if (len >= rcvbuf->page_len || len <= 0) {
 		dprintk("nfs: server returned giant symlink!\n");
@@ -3972,6 +4198,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	kaddr[len+rcvbuf->page_base] = '\0';
 	kunmap_atomic(kaddr, KM_USER0);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -4069,10 +4298,16 @@ static int decode_setattr(struct xdr_stream *xdr)
 	status = decode_op_hdr(xdr, OP_SETATTR);
 	if (status)
 		return status;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	bmlen = be32_to_cpup(p++);
-	READ_BUF(bmlen << 2);
-	return 0;
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (likely(p))
+		return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
@@ -4081,7 +4316,9 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	uint32_t opnum;
 	int32_t nfserr;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	opnum = be32_to_cpup(p++);
 	if (opnum != OP_SETCLIENTID) {
 		dprintk("nfs: decode_setclientid: Server returned operation"
@@ -4090,26 +4327,39 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	}
 	nfserr = be32_to_cpup(p++);
 	if (nfserr == NFS_OK) {
-		READ_BUF(8 + NFS4_VERIFIER_SIZE);
+		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &clp->cl_clientid);
 		memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
 
 		/* skip netid string */
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 
 		/* skip uaddr string */
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 		return -NFSERR_CLID_INUSE;
 	} else
 		return nfs4_stat_to_errno(nfserr);
 
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -4126,11 +4376,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 	if (status)
 		return status;
 
-	READ_BUF(16);
+	p = xdr_inline_decode(xdr, 16);
+	if (unlikely(!p))
+		goto out_overflow;
 	res->count = be32_to_cpup(p++);
 	res->verf->committed = be32_to_cpup(p++);
 	memcpy(res->verf->verifier, p, 8);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_delegreturn(struct xdr_stream *xdr)
@@ -4152,9 +4407,13 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	if (status)
 		return status;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	p = xdr_decode_hyper(p, &clp->cl_ex_clid);
-	READ_BUF(12);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	clp->cl_seqid = be32_to_cpup(p++);
 	clp->cl_exchange_flags = be32_to_cpup(p++);
 
@@ -4164,7 +4423,9 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 		return -EIO;
 
 	/* Throw away minor_id */
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 
 	/* Throw away Major id */
 	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
@@ -4182,6 +4443,9 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 		return status;
 
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -4190,7 +4454,9 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	__be32 *p;
 	u32 nr_attrs;
 
-	READ_BUF(28);
+	p = xdr_inline_decode(xdr, 28);
+	if (unlikely(!p))
+		goto out_overflow;
 	attrs->headerpadsz = be32_to_cpup(p++);
 	attrs->max_rqst_sz = be32_to_cpup(p++);
 	attrs->max_resp_sz = be32_to_cpup(p++);
@@ -4203,9 +4469,15 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 			__func__, nr_attrs);
 		return -EINVAL;
 	}
-	if (nr_attrs == 1)
-		READ_BUF(4); /* skip rdma_attrs */
+	if (nr_attrs == 1) {
+		p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
+		if (unlikely(!p))
+			goto out_overflow;
+	}
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
@@ -4228,7 +4500,9 @@ static int decode_create_session(struct xdr_stream *xdr,
 		return status;
 
 	/* seqid, flags */
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	clp->cl_seqid = be32_to_cpup(p++);
 	session->flags = be32_to_cpup(p++);
 
@@ -4237,6 +4511,9 @@ static int decode_create_session(struct xdr_stream *xdr,
 	if (!status)
 		status = decode_chan_attrs(xdr, &session->bc_attrs);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -4277,7 +4554,9 @@ static int decode_sequence(struct xdr_stream *xdr,
 		goto out_err;
 	}
 
-	READ_BUF(20);
+	p = xdr_inline_decode(xdr, 20);
+	if (unlikely(!p))
+		goto out_overflow;
 
 	/* seqid */
 	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
@@ -4302,6 +4581,10 @@ static int decode_sequence(struct xdr_stream *xdr,
 out_err:
 	res->sr_status = status;
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	status = -EIO;
+	goto out_err;
 #else  /* CONFIG_NFS_V4_1 */
 	return 0;
 #endif /* CONFIG_NFS_V4_1 */
-- 
cgit v0.10.2


From cccddf4f5580131c9b963900e1d3400655e633cc Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:19 +0300
Subject: nfs: nfs4xdr: optimize low level decoding

do not increment decoding ptr if not needed.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b0d690c..14b6f51 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2437,7 +2437,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	*len = be32_to_cpup(p++);
+	*len = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, *len);
 	if (unlikely(!p))
 		goto out_overflow;
@@ -2456,14 +2456,14 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	if (unlikely(!p))
 		goto out_overflow;
 	hdr->status = be32_to_cpup(p++);
-	hdr->taglen = be32_to_cpup(p++);
+	hdr->taglen = be32_to_cpup(p);
 
 	p = xdr_inline_decode(xdr, hdr->taglen + 4);
 	if (unlikely(!p))
 		goto out_overflow;
 	hdr->tag = (char *)p;
 	p += XDR_QUADLEN(hdr->taglen);
-	hdr->nops = be32_to_cpup(p++);
+	hdr->nops = be32_to_cpup(p);
 	if (unlikely(hdr->nops < 1))
 		return nfs4_stat_to_errno(hdr->status);
 	return 0;
@@ -2488,7 +2488,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 				opnum, expected);
 		return -EIO;
 	}
-	nfserr = be32_to_cpup(p++);
+	nfserr = be32_to_cpup(p);
 	if (nfserr != NFS_OK)
 		return nfs4_stat_to_errno(nfserr);
 	return 0;
@@ -2519,7 +2519,7 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 
 	bitmap[0] = bitmap[1] = 0;
 	p = xdr_inline_decode(xdr, (bmlen << 2));
@@ -2528,7 +2528,7 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	if (bmlen > 0) {
 		bitmap[0] = be32_to_cpup(p++);
 		if (bmlen > 1)
-			bitmap[1] = be32_to_cpup(p++);
+			bitmap[1] = be32_to_cpup(p);
 	}
 	return 0;
 out_overflow:
@@ -2543,7 +2543,7 @@ static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen,
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	*attrlen = be32_to_cpup(p++);
+	*attrlen = be32_to_cpup(p);
 	*savep = xdr->p;
 	return 0;
 out_overflow:
@@ -2574,7 +2574,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*type = be32_to_cpup(p++);
+		*type = be32_to_cpup(p);
 		if (*type < NF4REG || *type > NF4NAMEDATTR) {
 			dprintk("%s: bad type %d\n", __func__, *type);
 			return -EIO;
@@ -2601,7 +2601,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, change);
+		xdr_decode_hyper(p, change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
 		ret = NFS_ATTR_FATTR_CHANGE;
 	}
@@ -2625,7 +2625,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, size);
+		xdr_decode_hyper(p, size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
 		ret = NFS_ATTR_FATTR_SIZE;
 	}
@@ -2647,7 +2647,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
 	}
 	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2668,7 +2668,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
 	}
 	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2692,7 +2692,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 		if (unlikely(!p))
 			goto out_overflow;
 		p = xdr_decode_hyper(p, &fsid->major);
-		p = xdr_decode_hyper(p, &fsid->minor);
+		xdr_decode_hyper(p, &fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
 		ret = NFS_ATTR_FATTR_FSID;
 	}
@@ -2716,7 +2716,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
 	}
 	dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
@@ -2737,7 +2737,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
 	}
 	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
@@ -2759,7 +2759,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, fileid);
+		xdr_decode_hyper(p, fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2782,7 +2782,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, fileid);
+		xdr_decode_hyper(p, fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2805,7 +2805,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
 	}
 	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -2827,7 +2827,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
 	}
 	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
@@ -2849,7 +2849,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
 	}
 	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
@@ -2868,7 +2868,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	n = be32_to_cpup(p++);
+	n = be32_to_cpup(p);
 	if (n == 0)
 		goto root_path;
 	dprintk("path ");
@@ -2925,7 +2925,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	n = be32_to_cpup(p++);
+	n = be32_to_cpup(p);
 	if (n <= 0)
 		goto out_eio;
 	res->nlocations = 0;
@@ -2936,7 +2936,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		m = be32_to_cpup(p++);
+		m = be32_to_cpup(p);
 
 		loc->nservers = 0;
 		dprintk("%s: servers ", __func__);
@@ -2994,7 +2994,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
 	}
 	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
@@ -3016,7 +3016,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*maxlink = be32_to_cpup(p++);
+		*maxlink = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
 	}
 	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
@@ -3038,7 +3038,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*maxname = be32_to_cpup(p++);
+		*maxname = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
 	}
 	dprintk("%s: maxname=%u\n", __func__, *maxname);
@@ -3061,7 +3061,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, &maxread);
+		xdr_decode_hyper(p, &maxread);
 		if (maxread > 0x7FFFFFFF)
 			maxread = 0x7FFFFFFF;
 		*res = (uint32_t)maxread;
@@ -3087,7 +3087,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, &maxwrite);
+		xdr_decode_hyper(p, &maxwrite);
 		if (maxwrite > 0x7FFFFFFF)
 			maxwrite = 0x7FFFFFFF;
 		*res = (uint32_t)maxwrite;
@@ -3113,7 +3113,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		tmp = be32_to_cpup(p++);
+		tmp = be32_to_cpup(p);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
 		ret = NFS_ATTR_FATTR_MODE;
@@ -3137,7 +3137,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*nlink = be32_to_cpup(p++);
+		*nlink = be32_to_cpup(p);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
 		ret = NFS_ATTR_FATTR_NLINK;
 	}
@@ -3161,7 +3161,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -3196,7 +3196,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -3234,7 +3234,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 		if (unlikely(!p))
 			goto out_overflow;
 		major = be32_to_cpup(p++);
-		minor = be32_to_cpup(p++);
+		minor = be32_to_cpup(p);
 		tmp = MKDEV(major, minor);
 		if (MAJOR(tmp) == major && MINOR(tmp) == minor)
 			*rdev = tmp;
@@ -3260,7 +3260,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
 	}
 	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -3282,7 +3282,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
 	}
 	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
@@ -3304,7 +3304,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
 	}
 	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
@@ -3326,7 +3326,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, used);
+		xdr_decode_hyper(p, used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
 		ret = NFS_ATTR_FATTR_SPACE_USED;
 	}
@@ -3348,7 +3348,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 	if (unlikely(!p))
 		goto out_overflow;
 	p = xdr_decode_hyper(p, &sec);
-	nsec = be32_to_cpup(p++);
+	nsec = be32_to_cpup(p);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
 	return 0;
@@ -3437,7 +3437,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 		goto out_overflow;
 	cinfo->atomic = be32_to_cpup(p++);
 	p = xdr_decode_hyper(p, &cinfo->before);
-	p = xdr_decode_hyper(p, &cinfo->after);
+	xdr_decode_hyper(p, &cinfo->after);
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -3457,7 +3457,7 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	if (unlikely(!p))
 		goto out_overflow;
 	supp = be32_to_cpup(p++);
-	acc = be32_to_cpup(p++);
+	acc = be32_to_cpup(p);
 	access->supported = supp;
 	access->access = acc;
 	return 0;
@@ -3525,7 +3525,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, bmlen << 2);
 	if (likely(p))
 		return 0;
@@ -3786,7 +3786,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	len = be32_to_cpup(p++);
+	len = be32_to_cpup(p);
 	if (len > NFS4_FHSIZE)
 		return -EIO;
 	fh->size = len;
@@ -3836,7 +3836,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 		fl->fl_pid = 0;
 	}
 	p = xdr_decode_hyper(p, &clientid);
-	namelen = be32_to_cpup(p++);
+	namelen = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, namelen);
 	if (likely(p))
 		return -NFS4ERR_DENIED;
@@ -3903,11 +3903,11 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
-		p = xdr_decode_hyper(p, maxsize);
+		xdr_decode_hyper(p, maxsize);
 		break;
 	case 2:
 		nblocks = be32_to_cpup(p++);
-		blocksize = be32_to_cpup(p++);
+		blocksize = be32_to_cpup(p);
 		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
 	return 0;
@@ -3925,7 +3925,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	delegation_type = be32_to_cpup(p++);
+	delegation_type = be32_to_cpup(p);
 	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
 		res->delegation_type = 0;
 		return 0;
@@ -3936,7 +3936,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	res->do_recall = be32_to_cpup(p++);
+	res->do_recall = be32_to_cpup(p);
 
 	switch (delegation_type) {
 	case NFS4_OPEN_DELEGATE_READ:
@@ -3973,7 +3973,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	if (unlikely(!p))
 		goto out_overflow;
 	res->rflags = be32_to_cpup(p++);
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 	if (bmlen > 10)
 		goto xdr_error;
 
@@ -4043,7 +4043,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	if (unlikely(!p))
 		goto out_overflow;
 	eof = be32_to_cpup(p++);
-	count = be32_to_cpup(p++);
+	count = be32_to_cpup(p);
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
@@ -4174,7 +4174,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	len = be32_to_cpup(p++);
+	len = be32_to_cpup(p);
 	if (len >= rcvbuf->page_len || len <= 0) {
 		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
@@ -4301,7 +4301,7 @@ static int decode_setattr(struct xdr_stream *xdr)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, bmlen << 2);
 	if (likely(p))
 		return 0;
@@ -4325,7 +4325,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 			" %d\n", opnum);
 		return -EIO;
 	}
-	nfserr = be32_to_cpup(p++);
+	nfserr = be32_to_cpup(p);
 	if (nfserr == NFS_OK) {
 		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
 		if (unlikely(!p))
@@ -4339,7 +4339,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -4348,7 +4348,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -4410,7 +4410,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
-	p = xdr_decode_hyper(p, &clp->cl_ex_clid);
+	xdr_decode_hyper(p, &clp->cl_ex_clid);
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
 		goto out_overflow;
@@ -4418,7 +4418,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	clp->cl_exchange_flags = be32_to_cpup(p++);
 
 	/* We ask for SP4_NONE */
-	dummy = be32_to_cpup(p++);
+	dummy = be32_to_cpup(p);
 	if (dummy != SP4_NONE)
 		return -EIO;
 
@@ -4463,7 +4463,7 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	attrs->max_resp_sz_cached = be32_to_cpup(p++);
 	attrs->max_ops = be32_to_cpup(p++);
 	attrs->max_reqs = be32_to_cpup(p++);
-	nr_attrs = be32_to_cpup(p++);
+	nr_attrs = be32_to_cpup(p);
 	if (unlikely(nr_attrs > 1)) {
 		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
 			__func__, nr_attrs);
@@ -4504,7 +4504,7 @@ static int decode_create_session(struct xdr_stream *xdr,
 	if (unlikely(!p))
 		goto out_overflow;
 	clp->cl_seqid = be32_to_cpup(p++);
-	session->flags = be32_to_cpup(p++);
+	session->flags = be32_to_cpup(p);
 
 	/* Channel attributes */
 	status = decode_chan_attrs(xdr, &session->fc_attrs);
@@ -4576,7 +4576,7 @@ static int decode_sequence(struct xdr_stream *xdr,
 	/* target highest slot id - currently not processed */
 	dummy = be32_to_cpup(p++);
 	/* result flags - currently not processed */
-	dummy = be32_to_cpup(p++);
+	dummy = be32_to_cpup(p);
 	status = 0;
 out_err:
 	res->sr_status = status;
-- 
cgit v0.10.2